
Merge pull request #134 from llhuii/refactor-gm

gm: decouple all features into independent packages
tags/v0.3.1
KubeEdge Bot committed 4 years ago (commit 0c0e7cd337)
30 changed files with 2217 additions and 1948 deletions
  1. cmd/sedna-gm/app/controller.go (+8, -3)
  2. cmd/sedna-gm/sedna-gm.go (+4, -0)
  3. pkg/globalmanager/controller.go (+0, -71)
  4. pkg/globalmanager/controllers/dataset/dataset.go (+74, -0)
  5. pkg/globalmanager/controllers/dataset/downstream.go (+54, -0)
  6. pkg/globalmanager/controllers/dataset/upstream.go (+62, -0)
  7. pkg/globalmanager/controllers/federatedlearning/downstream.go (+56, -0)
  8. pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go (+176, -170)
  9. pkg/globalmanager/controllers/federatedlearning/upstream.go (+123, -0)
  10. pkg/globalmanager/controllers/incrementallearning/downstream.go (+145, -0)
  11. pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go (+221, -243)
  12. pkg/globalmanager/controllers/incrementallearning/upstream.go (+162, -0)
  13. pkg/globalmanager/controllers/jointinference/downstream.go (+56, -0)
  14. pkg/globalmanager/controllers/jointinference/jointinferenceservice.go (+138, -151)
  15. pkg/globalmanager/controllers/jointinference/upstream.go (+92, -0)
  16. pkg/globalmanager/controllers/lifelonglearning/downstream.go (+55, -0)
  17. pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go (+169, -196)
  18. pkg/globalmanager/controllers/lifelonglearning/upstream.go (+164, -0)
  19. pkg/globalmanager/controllers/manager.go (+128, -0)
  20. pkg/globalmanager/controllers/registry.go (+40, -0)
  21. pkg/globalmanager/controllers/upstream.go (+105, -0)
  22. pkg/globalmanager/downstream.go (+0, -388)
  23. pkg/globalmanager/runtime/common.go (+42, -11)
  24. pkg/globalmanager/runtime/secret_injector.go (+14, -3)
  25. pkg/globalmanager/runtime/storage_initializer_injector.go (+7, -7)
  26. pkg/globalmanager/runtime/types.go (+103, -0)
  27. pkg/globalmanager/runtime/worker.go (+14, -14)
  28. pkg/globalmanager/types.go (+0, -168)
  29. pkg/globalmanager/upstream.go (+0, -519)
  30. pkg/localcontroller/manager/incrementallearningjob.go (+5, -4)
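Editor's note: the new pkg/globalmanager/controllers/registry.go (+40 lines) is the heart of the decoupling, but its body is not included in this excerpt. A minimal sketch of the shape implied by the per-feature New(cc *runtime.ControllerContext) constructors shown below; every name other than runtime.ControllerContext and runtime.FeatureControllerI is an assumption, not the exact Sedna API:

package controllers

import "github.com/kubeedge/sedna/pkg/globalmanager/runtime"

// FeatureFactory builds one feature controller from the shared context.
// Each feature package (dataset, federatedlearning, ...) exports such a New.
type FeatureFactory func(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error)

// Registry maps a feature name to its factory, e.g. dataset.Name: dataset.New.
type Registry map[string]FeatureFactory

// NewRegistry collects all feature factories in one place; the manager can
// then iterate it to start every feature.
func NewRegistry() Registry {
	return Registry{
		// dataset.Name:           dataset.New,
		// federatedlearning.Name: federatedlearning.New,
		// ...one entry per feature package
	}
}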

cmd/sedna-gm/app/controller.go (+8, -3)

@@ -18,6 +18,7 @@ package app

 import (
 	"fmt"
+	"os"

 	"github.com/spf13/cobra"
 	"github.com/spf13/pflag"
@@ -27,7 +28,7 @@ import (
 	"k8s.io/klog/v2"

 	"github.com/kubeedge/sedna/cmd/sedna-gm/app/options"
-	controller "github.com/kubeedge/sedna/pkg/globalmanager"
+	controller "github.com/kubeedge/sedna/pkg/globalmanager/controllers"
 	"github.com/kubeedge/sedna/pkg/util"
 	"github.com/kubeedge/sedna/pkg/version/verflag"
 )
@@ -61,8 +62,12 @@ func NewControllerCommand() *cobra.Command {
 			if errs := config.Validate(); len(errs) > 0 {
 				klog.Fatal(util.SpliceErrors(errs.ToAggregate().Errors()))
 			}
-			c := controller.NewController(config)
-			c.Start()
+			c := controller.New(config)
+			err = c.Start()
+			if err != nil {
+				klog.Errorf("failed to start controller: %v", err)
+				os.Exit(1)
+			}
 		},
 	}
 	fs := cmd.Flags()


cmd/sedna-gm/sedna-gm.go (+4, -0)

@@ -17,7 +17,9 @@ limitations under the License.
 package main

 import (
+	"math/rand"
 	"os"
+	"time"

 	"k8s.io/component-base/logs"

@@ -25,6 +27,8 @@ import (
 )

 func main() {
+	rand.Seed(time.Now().UnixNano())
+
 	command := app.NewControllerCommand()
 	logs.InitLogs()
 	defer logs.FlushLogs()

pkg/globalmanager/controller.go (+0, -71)

@@ -1,71 +0,0 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package globalmanager

import (
"fmt"
"os"

"k8s.io/klog/v2"

"github.com/kubeedge/sedna/pkg/globalmanager/config"
websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
)

// MainController defines the main controller
type MainController struct {
Config *config.ControllerConfig
}

// NewController creates a new main controller
func NewController(cc *config.ControllerConfig) *MainController {
config.InitConfigure(cc)
return &MainController{
Config: cc,
}
}

// Start starts the main controller
func (c *MainController) Start() {
type newFunc func(cfg *config.ControllerConfig) (FeatureControllerI, error)

for _, featureFunc := range []newFunc{
NewUpstreamController,
NewDownstreamController,
NewFederatedController,
NewJointController,
NewIncrementalJobController,
NewLifelongLearningJobController,
} {
f, _ := featureFunc(c.Config)
err := f.Start()
if err != nil {
klog.Warningf("failed to start controller %s: %+v", f.GetName(), err)
} else {
klog.Infof("started controller %s", f.GetName())
}
}

addr := fmt.Sprintf("%s:%d", c.Config.WebSocket.Address, c.Config.WebSocket.Port)

ws := websocket.NewServer(addr)
err := ws.ListenAndServe()
if err != nil {
klog.Fatalf("failed to listen websocket at %s", addr)
os.Exit(1)
}
}
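Editor's note: with MainController deleted, starting the features presumably moves into the new controllers/manager.go (+128 lines, not shown in this excerpt). A hedged sketch of what that replacement plausibly looks like, given the Run(stopCh) methods the feature controllers now expose; the Manager type, its field, and startFeatures are assumptions:

package controllers

import (
	"fmt"

	"k8s.io/klog/v2"

	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

// Manager is an assumed reconstruction of the replacement for MainController:
// it creates every registered feature controller from the shared context and
// runs each one in its own goroutine, instead of the hard-coded constructor
// list deleted above.
type Manager struct {
	context *runtime.ControllerContext
}

func (m *Manager) startFeatures(stopCh <-chan struct{}) error {
	for name, factory := range NewRegistry() {
		f, err := factory(m.context)
		if err != nil {
			return fmt.Errorf("failed to create controller %s: %w", name, err)
		}
		go f.Run(stopCh)
		klog.Infof("started controller %s", name)
	}
	return nil
}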

pkg/globalmanager/controllers/dataset/dataset.go (+74, -0)

@@ -0,0 +1,74 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dataset

import (
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"

sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/config"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

const (
// KindName is the kind name of CR this controller controls
KindName = "Dataset"

// Name is this controller name
Name = "Dataset"
)

// Controller handles all dataset objects including: syncing to edge and update from edge.
type Controller struct {
kubeClient kubernetes.Interface
client sednaclientset.SednaV1alpha1Interface

cfg *config.ControllerConfig

sendToEdgeFunc runtime.DownstreamSendFunc
}

func (c *Controller) Run(stopCh <-chan struct{}) {
// noop now
}

// New creates a dataset controller
func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
c := &Controller{
client: cc.SednaClient.SednaV1alpha1(),
kubeClient: cc.KubeClient,
}
informer := cc.SednaInformerFactory.Sedna().V1alpha1().Datasets().Informer()
informer.AddEventHandler(cache.ResourceEventHandlerFuncs{

AddFunc: func(obj interface{}) {
c.syncToEdge(watch.Added, obj)
},

UpdateFunc: func(old, cur interface{}) {
c.syncToEdge(watch.Added, cur)
},

DeleteFunc: func(obj interface{}) {
c.syncToEdge(watch.Deleted, obj)
},
})

return c, nil
}
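Editor's note: every feature controller in this PR returns runtime.FeatureControllerI from its New and implements Run, SetDownstreamSendFunc and SetUpstreamHandler. The real definitions live in pkg/globalmanager/runtime/types.go (+103 lines, not part of this excerpt); the following is an inferred sketch of the contract, reconstructed from the method sets and call sites in this diff rather than copied from the source:

package runtime

import "k8s.io/apimachinery/pkg/watch"

// DownstreamSendFunc delivers an object event to the LC on the named edge node
// (signature inferred from calls like c.sendToEdgeFunc(nodeName, eventType, dataset)).
type DownstreamSendFunc func(nodeName string, eventType watch.EventType, obj interface{}) error

// UpstreamHandlerAddFunc registers a per-kind handler for status updates
// reported back from the edge (inferred from addFunc(KindName, c.updateFromEdge)).
type UpstreamHandlerAddFunc func(kind string, handler func(name, namespace, operation string, content []byte) error) error

// FeatureControllerI is the contract each feature controller satisfies.
type FeatureControllerI interface {
	Run(stopCh <-chan struct{})
	SetDownstreamSendFunc(DownstreamSendFunc) error
	SetUpstreamHandler(UpstreamHandlerAddFunc) error
}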

pkg/globalmanager/controllers/dataset/downstream.go (+54, -0)

@@ -0,0 +1,54 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dataset

import (
"fmt"

"k8s.io/apimachinery/pkg/watch"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

// syncToEdge syncs the dataset resources
func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
dataset, ok := obj.(*sednav1.Dataset)
if !ok {
return nil
}

// Since dataset.Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
dataset.Kind = KindName

// Here only propagate to the nodes with non empty name
nodeName := dataset.Spec.NodeName
if len(nodeName) == 0 {
return fmt.Errorf("empty node name")
}

runtime.InjectSecretAnnotations(c.kubeClient, dataset, dataset.Spec.CredentialName)

return c.sendToEdgeFunc(nodeName, eventType, dataset)
}

func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
c.sendToEdgeFunc = f
return nil
}

pkg/globalmanager/controllers/dataset/upstream.go (+62, -0)

@@ -0,0 +1,62 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dataset

import (
"context"
"encoding/json"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

// updateFromEdge syncs update from edge
func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error {
status := sednav1.DatasetStatus{}
err := json.Unmarshal(content, &status)
if err != nil {
return err
}

return c.updateStatus(name, namespace, status)
}

// updateStatus updates the dataset status
func (c *Controller) updateStatus(name, namespace string, status sednav1.DatasetStatus) error {
client := c.client.Datasets(namespace)

if status.UpdateTime == nil {
now := metav1.Now()
status.UpdateTime = &now
}

return runtime.RetryUpdateStatus(name, namespace, func() error {
dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
dataset.Status = status
_, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{})
return err
})
}

func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
return addFunc(KindName, c.updateFromEdge)
}
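Editor's note: runtime.RetryUpdateStatus (defined in pkg/globalmanager/runtime/common.go, outside this excerpt) replaces the hand-rolled ResourceUpdateRetries loops deleted elsewhere in this PR. A plausible minimal shape, assuming it simply retries the closure while the API server reports a write conflict; the real implementation may differ:

package runtime

import (
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/client-go/util/retry"
)

// RetryUpdateStatus retries fn with the default backoff as long as the error
// is a conflict, i.e. a concurrent writer updated the object first. The name
// and namespace arguments are presumably used for logging in the real code.
func RetryUpdateStatus(name, namespace string, fn func() error) error {
	return retry.OnError(retry.DefaultBackoff, errors.IsConflict, fn)
}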

pkg/globalmanager/controllers/federatedlearning/downstream.go (+56, -0)

@@ -0,0 +1,56 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package federatedlearning

import (
"k8s.io/apimachinery/pkg/watch"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
job, ok := obj.(*sednav1.FederatedLearningJob)
if !ok {
return nil
}

// Since Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
job.Kind = KindName

// broadcast to all nodes specified in spec
nodeset := make(map[string]bool)
for _, trainingWorker := range job.Spec.TrainingWorkers {
// Here only propagate to the nodes with non empty name
if len(trainingWorker.Template.Spec.NodeName) > 0 {
nodeset[trainingWorker.Template.Spec.NodeName] = true
}
}

for nodeName := range nodeset {
c.sendToEdgeFunc(nodeName, eventType, job)
}
return nil
}

func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
c.sendToEdgeFunc = f

return nil
}

pkg/globalmanager/federatedlearningjob.go → pkg/globalmanager/controllers/federatedlearning/federatedlearningjob.go (+176, -170)

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-package globalmanager
+package federatedlearning

 import (
 	"context"
@@ -28,7 +28,7 @@ import (
 	utilrand "k8s.io/apimachinery/pkg/util/rand"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
-	kubeinformers "k8s.io/client-go/informers"
+	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -40,33 +40,37 @@ import (
 	k8scontroller "k8s.io/kubernetes/pkg/controller"

 	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
-	clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
 	sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
-	informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
 	sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
 	"github.com/kubeedge/sedna/pkg/globalmanager/config"
-	messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
-	"github.com/kubeedge/sedna/pkg/globalmanager/utils"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
 )

 const (
-	FLJobStageAgg   = "Aggregation"
-	FLJobStageTrain = "Training"
+	// KindName is the kind name of CR this controller controls
+	KindName = "FederatedLearningJob"
+	// Name is this controller name
+	Name = "FederatedLearning"
 )

-// flJobControllerKind contains the schema.GroupVersionKind for this controller type.
-var flJobControllerKind = sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob")
+const (
+	jobStageAgg   = "Aggregation"
+	jobStageTrain = "Training"
+)
+
+// Kind contains the schema.GroupVersionKind for this controller type.
+var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)

-// FederatedController ensures that all FLJob objects have corresponding pods to
+// Controller ensures that all FederatedLearningJob objects have corresponding pods to
 // run their configured workload.
-type FederatedController struct {
+type Controller struct {
 	kubeClient kubernetes.Interface
 	client     sednaclientset.SednaV1alpha1Interface

 	// podStoreSynced returns true if the pod store has been synced at least once.
 	// Added as a member to the struct to allow injection for testing.
 	podStoreSynced cache.InformerSynced
-	// jobStoreSynced returns true if the flJob store has been synced at least once.
+	// jobStoreSynced returns true if the FederatedLearningJob store has been synced at least once.
 	// Added as a member to the struct to allow injection for testing.
 	jobStoreSynced cache.InformerSynced
@@ -82,48 +86,47 @@ type FederatedController struct {
 	recorder record.EventRecorder

 	cfg *config.ControllerConfig
+
+	sendToEdgeFunc runtime.DownstreamSendFunc
 }

-// Run the main goroutine responsible for watching and syncing jobs.
-func (fc *FederatedController) Start() error {
+// Run starts the main goroutine responsible for watching and syncing jobs.
+func (c *Controller) Run(stopCh <-chan struct{}) {
 	workers := 1
-	stopCh := messageContext.Done()

-	go func() {
-		defer utilruntime.HandleCrash()
-		defer fc.queue.ShutDown()
-		klog.Infof("Starting federatedlearning job controller")
-		defer klog.Infof("Shutting down federatedlearning job controller")
+	defer utilruntime.HandleCrash()
+	defer c.queue.ShutDown()

-		if !cache.WaitForNamedCacheSync("federatedlearning job", stopCh, fc.podStoreSynced, fc.jobStoreSynced) {
-			klog.Errorf("failed to wait for caches to sync")
+	klog.Infof("Starting %s controller", Name)
+	defer klog.Infof("Shutting down %s controller", Name)

-			return
-		}
+	if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) {
+		klog.Errorf("failed to wait for %s caches to sync", Name)

-		klog.Infof("Starting federatedlearning job workers")
-		for i := 0; i < workers; i++ {
-			go wait.Until(fc.worker, time.Second, stopCh)
-		}
+		return
+	}

-		<-stopCh
-	}()
-	return nil
+	klog.Infof("Starting %s workers", Name)
+	for i := 0; i < workers; i++ {
+		go wait.Until(c.worker, time.Second, stopCh)
+	}
+
+	<-stopCh
 }

 // enqueueByPod enqueues the FederatedLearningJob object of the specified pod.
-func (fc *FederatedController) enqueueByPod(pod *v1.Pod, immediate bool) {
+func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
 	controllerRef := metav1.GetControllerOf(pod)

 	if controllerRef == nil {
 		return
 	}

-	if controllerRef.Kind != flJobControllerKind.Kind {
+	if controllerRef.Kind != Kind.Kind {
 		return
 	}

-	job, err := fc.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name)
+	job, err := c.jobLister.FederatedLearningJobs(pod.Namespace).Get(controllerRef.Name)
 	if err != nil {
 		return
 	}
@@ -132,27 +135,27 @@ func (fc *FederatedController) enqueueByPod(pod *v1.Pod, immediate bool) {
 		return
 	}

-	fc.enqueueController(job, immediate)
+	c.enqueueController(job, immediate)
 }

 // When a pod is created, enqueue the controller that manages it and update it's expectations.
-func (fc *FederatedController) addPod(obj interface{}) {
+func (c *Controller) addPod(obj interface{}) {
 	pod := obj.(*v1.Pod)
 	if pod.DeletionTimestamp != nil {
 		// on a restart of the controller, it's possible a new pod shows up in a state that
 		// is already pending deletion. Prevent the pod from being a creation observation.
-		fc.deletePod(pod)
+		c.deletePod(pod)
 		return
 	}

 	// backoff to queue when PodFailed
 	immediate := pod.Status.Phase != v1.PodFailed

-	fc.enqueueByPod(pod, immediate)
+	c.enqueueByPod(pod, immediate)
 }

 // When a pod is updated, figure out what federatedlearning job manage it and wake them up.
-func (fc *FederatedController) updatePod(old, cur interface{}) {
+func (c *Controller) updatePod(old, cur interface{}) {
 	curPod := cur.(*v1.Pod)
 	oldPod := old.(*v1.Pod)

@@ -161,11 +164,11 @@ func (fc *FederatedController) updatePod(old, cur interface{}) {
 		return
 	}

-	fc.addPod(curPod)
+	c.addPod(curPod)
 }

 // deletePod enqueues the FederatedLearningJob obj When a pod is deleted
-func (fc *FederatedController) deletePod(obj interface{}) {
+func (c *Controller) deletePod(obj interface{}) {
 	pod, ok := obj.(*v1.Pod)

 	// comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
@@ -186,13 +189,13 @@ func (fc *FederatedController) deletePod(obj interface{}) {
 			return
 		}
 	}
-	fc.enqueueByPod(pod, true)
+	c.enqueueByPod(pod, true)
 }

 // obj could be an *sednav1.FederatedLearningJob, or a DeletionFinalStateUnknown marker item,
 // immediate tells the controller to update the status right away, and should
 // happen ONLY when there was a successful pod run.
-func (fc *FederatedController) enqueueController(obj interface{}, immediate bool) {
+func (c *Controller) enqueueController(obj interface{}, immediate bool) {
 	key, err := k8scontroller.KeyFunc(obj)
 	if err != nil {
 		klog.Warningf("Couldn't get key for object %+v: %v", obj, err)
@@ -201,43 +204,43 @@ func (fc *FederatedController) enqueueController(obj interface{}, immediate bool

 	backoff := time.Duration(0)
 	if !immediate {
-		backoff = getBackoff(fc.queue, key)
+		backoff = runtime.GetBackoff(c.queue, key)
 	}
-	fc.queue.AddAfter(key, backoff)
+	c.queue.AddAfter(key, backoff)
 }

 // worker runs a worker thread that just dequeues items, processes them, and marks them done.
 // It enforces that the syncHandler is never invoked concurrently with the same key.
-func (fc *FederatedController) worker() {
-	for fc.processNextWorkItem() {
+func (c *Controller) worker() {
+	for c.processNextWorkItem() {
 	}
 }

-func (fc *FederatedController) processNextWorkItem() bool {
-	key, quit := fc.queue.Get()
+func (c *Controller) processNextWorkItem() bool {
+	key, quit := c.queue.Get()
 	if quit {
 		return false
 	}
-	defer fc.queue.Done(key)
+	defer c.queue.Done(key)

-	forget, err := fc.syncFLJob(key.(string))
+	forget, err := c.sync(key.(string))
 	if err == nil {
 		if forget {
-			fc.queue.Forget(key)
+			c.queue.Forget(key)
 		}
 		return true
 	}

 	klog.Warningf("Error syncing federatedlearning job: %v", err)
-	fc.queue.AddRateLimited(key)
+	c.queue.AddRateLimited(key)

 	return true
 }

-// syncFLJob will sync the flJob with the given key if it has had its expectations fulfilled, meaning
+// sync will sync the FederatedLearningJob with the given key if it has had its expectations fulfilled, meaning
 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
 // concurrently with the same key.
-func (fc *FederatedController) syncFLJob(key string) (bool, error) {
+func (c *Controller) sync(key string) (bool, error) {
 	startTime := time.Now()
 	defer func() {
 		klog.V(4).Infof("Finished syncing federatedlearning job %q (%v)", key, time.Since(startTime))
@@ -250,91 +253,96 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) {
 	if len(ns) == 0 || len(name) == 0 {
 		return false, fmt.Errorf("invalid federatedlearning job key %q: either namespace or name is missing", key)
 	}
-	sharedFLJob, err := fc.jobLister.FederatedLearningJobs(ns).Get(name)
+	sharedJob, err := c.jobLister.FederatedLearningJobs(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
-			klog.V(4).Infof("FLJob has been deleted: %v", key)
+			klog.V(4).Infof("%s %v has been deleted", Name, key)
 			return true, nil
 		}
 		return false, err
 	}
-	flJob := *sharedFLJob
-	// set kind for flJob in case that the kind is None
-	flJob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("FederatedLearningJob"))
-	// if flJob was finished previously, we don't want to redo the termination
-	if IsFLJobFinished(&flJob) {
+
+	job := *sharedJob
+	// set kind for FederatedLearningJob in case that the kind is None
+	job.SetGroupVersionKind(Kind)
+
+	// if job was finished previously, we don't want to redo the termination
+	if IsJobFinished(&job) {
 		return true, nil
 	}
-	selector, _ := GenerateSelector(&flJob)
-	pods, err := fc.podStore.Pods(flJob.Namespace).List(selector)
+
+	selector, _ := runtime.GenerateSelector(&job)
+	pods, err := c.podStore.Pods(job.Namespace).List(selector)
 	if err != nil {
 		return false, err
 	}

 	activePods := k8scontroller.FilterActivePods(pods)
 	active := int32(len(activePods))
-	succeeded, failed := getStatus(pods)
-	conditions := len(flJob.Status.Conditions)
-	// flJob first start
-	if flJob.Status.StartTime == nil {
+	succeeded, failed := countPods(pods)
+	conditions := len(job.Status.Conditions)
+
+	// set StartTime when job is handled firstly
+	if job.Status.StartTime == nil {
 		now := metav1.Now()
-		flJob.Status.StartTime = &now
+		job.Status.StartTime = &now
 	}

 	var manageJobErr error
 	jobFailed := false
 	var failureReason string
 	var failureMessage string
-	phase := flJob.Status.Phase
+	phase := job.Status.Phase

 	if failed > 0 {
 		jobFailed = true
 		failureReason = "workerFailed"
-		failureMessage = "the worker of FLJob failed"
+		failureMessage = "the worker of FederatedLearningJob failed"
 	}

 	if jobFailed {
-		flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage))
-		flJob.Status.Phase = sednav1.FLJobFailed
-		fc.recorder.Event(&flJob, v1.EventTypeWarning, failureReason, failureMessage)
+		job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondFailed, failureReason, failureMessage))
+		job.Status.Phase = sednav1.FLJobFailed
+		c.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
 	} else {
 		// in the First time, we create the pods
 		if len(pods) == 0 {
-			active, manageJobErr = fc.createPod(&flJob)
+			active, manageJobErr = c.createPod(&job)
 		}
 		complete := false
 		if succeeded > 0 && active == 0 {
 			complete = true
 		}
 		if complete {
-			flJob.Status.Conditions = append(flJob.Status.Conditions, NewFLJobCondition(sednav1.FLJobCondComplete, "", ""))
+			job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(sednav1.FLJobCondComplete, "", ""))
 			now := metav1.Now()
-			flJob.Status.CompletionTime = &now
-			fc.recorder.Event(&flJob, v1.EventTypeNormal, "Completed", "FLJob completed")
-			flJob.Status.Phase = sednav1.FLJobSucceeded
+			job.Status.CompletionTime = &now
+			c.recorder.Event(&job, v1.EventTypeNormal, "Completed", "FederatedLearningJob completed")
+			job.Status.Phase = sednav1.FLJobSucceeded
 		} else {
-			flJob.Status.Phase = sednav1.FLJobRunning
+			job.Status.Phase = sednav1.FLJobRunning
 		}
 	}

 	forget := false
 	// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
 	// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
-	// improve the FLJob backoff policy when parallelism > 1 and few FLJobs failed but others succeed.
+	// improve the job backoff policy when parallelism > 1 and few FLJobs failed but others succeed.
 	// In this case, we should clear the backoff delay.
-	if flJob.Status.Succeeded < succeeded {
+	if job.Status.Succeeded < succeeded {
 		forget = true
 	}

-	// no need to update the flJob if the status hasn't changed since last time
-	if flJob.Status.Active != active || flJob.Status.Succeeded != succeeded || flJob.Status.Failed != failed || len(flJob.Status.Conditions) != conditions || flJob.Status.Phase != phase {
-		flJob.Status.Active = active
-		flJob.Status.Succeeded = succeeded
-		flJob.Status.Failed = failed
+	// no need to update the job if the status hasn't changed since last time
+	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions || job.Status.Phase != phase {
+		job.Status.Active = active
+		job.Status.Succeeded = succeeded
+		job.Status.Failed = failed
+		c.updateJobStatus(&job)

-		if jobFailed && !IsFLJobFinished(&flJob) {
-			// returning an error will re-enqueue FLJob after the backoff period
-			return forget, fmt.Errorf("failed pod(s) detected for flJob key %q", key)
+		if jobFailed && !IsJobFinished(&job) {
+			// returning an error will re-enqueue FederatedLearningJob after the backoff period
+			return forget, fmt.Errorf("failed pod(s) detected for FederatedLearningJob key %q", key)
 		}

 		forget = true
@@ -343,7 +351,7 @@ func (fc *FederatedController) syncFLJob(key string) (bool, error) {
 	return forget, manageJobErr
 }

-func NewFLJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition {
+func NewJobCondition(conditionType sednav1.FLJobConditionType, reason, message string) sednav1.FLJobCondition {
 	return sednav1.FLJobCondition{
 		Type:   conditionType,
 		Status: v1.ConditionTrue,
@@ -354,28 +362,24 @@ func NewFLJobCondition(conditionType sednav1.FLJobConditionType, reason, message
 	}
 }

-// getStatus returns no of succeeded and failed pods running a flJob
-func getStatus(pods []*v1.Pod) (succeeded, failed int32) {
+// countPods returns number of succeeded and failed pods
+func countPods(pods []*v1.Pod) (succeeded, failed int32) {
 	succeeded = int32(filterPods(pods, v1.PodSucceeded))
 	failed = int32(filterPods(pods, v1.PodFailed))
 	return
 }

-func (fc *FederatedController) updateFLJobStatus(flJob *sednav1.FederatedLearningJob) error {
-	jobClient := fc.client.FederatedLearningJobs(flJob.Namespace)
-	var err error
-	for i := 0; i <= ResourceUpdateRetries; i = i + 1 {
-		var newFLJob *sednav1.FederatedLearningJob
-		newFLJob, err = jobClient.Get(context.TODO(), flJob.Name, metav1.GetOptions{})
+func (c *Controller) updateJobStatus(job *sednav1.FederatedLearningJob) error {
+	jobClient := c.client.FederatedLearningJobs(job.Namespace)
+	return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
+		newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
 		if err != nil {
-			break
-		}
-		newFLJob.Status = flJob.Status
-		if _, err = jobClient.UpdateStatus(context.TODO(), newFLJob, metav1.UpdateOptions{}); err == nil {
-			break
+			return err
 		}
-	}
-	return nil
+		newJob.Status = job.Status
+		_, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
+		return err
+	})
 }

 // filterPods returns pods based on their phase.
@@ -389,7 +393,7 @@ func filterPods(pods []*v1.Pod, phase v1.PodPhase) int {
 	return result
 }

-func IsFLJobFinished(j *sednav1.FederatedLearningJob) bool {
+func IsJobFinished(j *sednav1.FederatedLearningJob) bool {
 	for _, c := range j.Status.Conditions {
 		if (c.Type == sednav1.FLJobCondComplete || c.Type == sednav1.FLJobCondFailed) && c.Status == v1.ConditionTrue {
 			return true
@@ -398,12 +402,12 @@ func IsFLJobFinished(j *sednav1.FederatedLearningJob) bool {
 	return false
 }

-func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
+func (c *Controller) createPod(job *sednav1.FederatedLearningJob) (active int32, err error) {
 	active = 0
 	ctx := context.Background()

 	modelName := job.Spec.AggregationWorker.Model.Name
-	model, err := fc.client.Models(job.Namespace).Get(ctx, modelName, metav1.GetOptions{})
+	model, err := c.client.Models(job.Namespace).Get(ctx, modelName, metav1.GetOptions{})
 	if err != nil {
 		return active, fmt.Errorf("failed to get model %s: %w",
 			modelName, err)
@@ -412,7 +416,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act
 	secretName := model.Spec.CredentialName
 	var modelSecret *v1.Secret
 	if secretName != "" {
-		modelSecret, _ = fc.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
+		modelSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
 	}

 	participantsCount := strconv.Itoa(len(job.Spec.TrainingWorkers))
@@ -420,10 +424,10 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act
 	// deliver pod for aggregation worker
 	aggWorker := job.Spec.AggregationWorker

-	// Configure container mounting and Env information by initial WorkerParam
+	// Configure aggregation worker's mounts and envs
 	var aggPort int32 = 7363
-	var aggWorkerParam *WorkerParam = new(WorkerParam)
-	aggWorkerParam.env = map[string]string{
+	var aggWorkerParam runtime.WorkerParam
+	aggWorkerParam.Env = map[string]string{
 		"NAMESPACE":   job.Namespace,
 		"WORKER_NAME": "aggworker-" + utilrand.String(5),
 		"JOB_NAME":    job.Name,
@@ -432,12 +436,12 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act
 		"PARTICIPANTS_COUNT": participantsCount,
 	}

-	aggWorkerParam.workerType = FLJobStageAgg
-	aggWorkerParam.restartPolicy = v1.RestartPolicyOnFailure
+	aggWorkerParam.WorkerType = jobStageAgg
+	aggWorkerParam.RestartPolicy = v1.RestartPolicyOnFailure

-	aggWorkerParam.mounts = append(aggWorkerParam.mounts,
-		WorkerMount{
-			URL: &MountURL{
+	aggWorkerParam.Mounts = append(aggWorkerParam.Mounts,
+		runtime.WorkerMount{
+			URL: &runtime.MountURL{
 				URL:                   model.Spec.URL,
 				Secret:                modelSecret,
 				DownloadByInitializer: false,
@@ -447,9 +451,9 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act
 	)

 	// create aggpod based on configured parameters
-	_, err = createPodWithTemplate(fc.kubeClient, job, &aggWorker.Template, aggWorkerParam)
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &aggWorker.Template, &aggWorkerParam)
 	if err != nil {
-		return active, err
+		return active, fmt.Errorf("failed to create aggregation worker: %w", err)
 	}
 	active++

@@ -458,17 +462,21 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act

 	// FIXME(llhuii): only the case that Spec.NodeName specified is support,
 	// will support Spec.NodeSelector.
-	appIP, err = GetNodeIPByName(fc.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName)
+	appIP, err = runtime.GetNodeIPByName(c.kubeClient, job.Spec.AggregationWorker.Template.Spec.NodeName)
+	if err != nil {
+		return active, err
+	}

-	aggServicePort, err = CreateKubernetesService(fc.kubeClient, job, FLJobStageAgg, aggPort, appIP)
+	aggServicePort, err = runtime.CreateKubernetesService(c.kubeClient, job, jobStageAgg, aggPort, appIP)
 	if err != nil {
 		return active, err
 	}
+
 	// deliver pod for training worker
-	for _, trainingWorker := range job.Spec.TrainingWorkers {
+	for i, trainingWorker := range job.Spec.TrainingWorkers {
 		// get dataseturl through parsing crd of dataset
 		datasetName := trainingWorker.Dataset.Name
-		dataset, err := fc.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{})
+		dataset, err := c.client.Datasets(job.Namespace).Get(ctx, datasetName, metav1.GetOptions{})
 		if err != nil {
 			return active, fmt.Errorf("failed to get dataset %s: %w",
 				datasetName, err)
@@ -477,23 +485,22 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act
 		secretName := dataset.Spec.CredentialName
 		var datasetSecret *v1.Secret
 		if secretName != "" {
-			datasetSecret, _ = fc.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
+			datasetSecret, _ = c.kubeClient.CoreV1().Secrets(job.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
 		}

-		// Configure container mounting and Env information by initial WorkerParam
-		var workerParam *WorkerParam = new(WorkerParam)
-
-		workerParam.mounts = append(workerParam.mounts,
-			WorkerMount{
-				URL: &MountURL{
+		// Configure training worker's mounts and envs
+		var workerParam runtime.WorkerParam
+		workerParam.Mounts = append(workerParam.Mounts,
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					URL:    model.Spec.URL,
 					Secret: modelSecret,
 				},
 				EnvName: "MODEL_URL",
 			},

-			WorkerMount{
-				URL: &MountURL{
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					URL:    dataset.Spec.URL,
 					Secret: datasetSecret,
 				},
@@ -501,7 +508,7 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act
 			},
 		)

-		workerParam.env = map[string]string{
+		workerParam.Env = map[string]string{
 			"AGG_PORT": strconv.Itoa(int(aggServicePort)),
 			"AGG_IP":   appIP,

@@ -511,65 +518,67 @@ func (fc *FederatedController) createPod(job *sednav1.FederatedLearningJob) (act
 			"NAMESPACE":    job.Namespace,
 			"MODEL_NAME":   modelName,
 			"DATASET_NAME": datasetName,
-			"LC_SERVER":    fc.cfg.LC.Server,
+			"LC_SERVER":    c.cfg.LC.Server,
 		}
-		workerParam.workerType = TrainPodType
-		workerParam.hostNetwork = true
-		workerParam.restartPolicy = v1.RestartPolicyOnFailure
-		// create train pod based on configured parameters
-		_, err = createPodWithTemplate(fc.kubeClient, job, &trainingWorker.Template, workerParam)
+		workerParam.WorkerType = runtime.TrainPodType
+		workerParam.HostNetwork = true
+		workerParam.RestartPolicy = v1.RestartPolicyOnFailure
+
+		// create training worker based on configured parameters
+		_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &trainingWorker.Template, &workerParam)
 		if err != nil {
-			return active, err
+			return active, fmt.Errorf("failed to create %dth training worker: %w", i, err)
 		}
 		active++
 	}
 	return
 }

-func (fc *FederatedController) GetName() string {
-	return "FederatedLearningJobController"
-}
-
-// NewFederatedController creates a new FederatedLearningJob controller that keeps the relevant pods
-// in sync with their corresponding FFederatedLearningJob objects.
-func NewFederatedController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
-	namespace := cfg.Namespace
-	if namespace == "" {
-		namespace = metav1.NamespaceAll
-	}
-	kubeClient, err := utils.KubeClient()
-	kubecfg, _ := utils.KubeConfig()
-	crdclient, err := clientset.NewForConfig(kubecfg)
-	kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace))
+// New creates a new federated learning job controller that keeps the relevant pods
+// in sync with their corresponding FederatedLearningJob objects.
+func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
+	cfg := cc.Config

-	podInformer := kubeInformerFactory.Core().V1().Pods()
+	podInformer := cc.KubeInformerFactory.Core().V1().Pods()

-	jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace))
-	jobInformer := jobInformerFactory.Sedna().V1alpha1().FederatedLearningJobs()
+	jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().FederatedLearningJobs()

 	eventBroadcaster := record.NewBroadcaster()
-	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
+	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})

-	fc := &FederatedController{
-		kubeClient: kubeClient,
-		client:     crdclient.SednaV1alpha1(),
+	fc := &Controller{
+		kubeClient: cc.KubeClient,
+		client:     cc.SednaClient.SednaV1alpha1(),

-		queue:    workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "flJob"),
-		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "flJob-controller"}),
+		queue:    workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name),
+		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: Name + "-controller"}),
 		cfg:      cfg,
 	}

 	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj interface{}) {
 			fc.enqueueController(obj, true)
+
+			// when a federated learning job is added,
+			// send it to edge's LC.
+			fc.syncToEdge(watch.Added, obj)
 		},
 		UpdateFunc: func(old, cur interface{}) {
 			fc.enqueueController(cur, true)
+
+			// when a federated learning job is updated,
+			// send it to edge's LC as Added event.
+			fc.syncToEdge(watch.Added, cur)
 		},
 		DeleteFunc: func(obj interface{}) {
 			fc.enqueueController(obj, true)
+
+			// when a federated learning job is deleted,
+			// send it to edge's LC.
			fc.syncToEdge(watch.Deleted, obj)
 		},
 	})
+
 	fc.jobLister = jobInformer.Lister()
 	fc.jobStoreSynced = jobInformer.Informer().HasSynced

@@ -581,8 +590,5 @@ func NewFederatedController(cfg *config.ControllerConfig) (FeatureControllerI, e
 	fc.podStore = podInformer.Lister()
 	fc.podStoreSynced = podInformer.Informer().HasSynced

-	stopCh := make(chan struct{})
-	kubeInformerFactory.Start(stopCh)
-	jobInformerFactory.Start(stopCh)
-	return fc, err
+	return fc, nil
 }
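Editor's note: New now receives everything through cc *runtime.ControllerContext instead of building its own clients and informer factories, and the factories' Start calls leave the feature (the three deleted lines at the end). The context type is defined in runtime/types.go, outside this excerpt; its field set can be inferred from the usages above. An assumption-level sketch:

package runtime

import (
	kubeinformers "k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"

	clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
	informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
	"github.com/kubeedge/sedna/pkg/globalmanager/config"
)

// ControllerContext bundles the shared clients and informer factories that
// the manager presumably creates once and hands to every feature's New()
// (fields inferred from cc.Config, cc.KubeClient, cc.KubeInformerFactory,
// cc.SednaClient and cc.SednaInformerFactory above).
type ControllerContext struct {
	Config *config.ControllerConfig

	KubeClient          kubernetes.Interface
	KubeInformerFactory kubeinformers.SharedInformerFactory

	SednaClient          clientset.Interface
	SednaInformerFactory informers.SharedInformerFactory
}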

pkg/globalmanager/controllers/federatedlearning/upstream.go (+123, -0)

@@ -0,0 +1,123 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package federatedlearning

import (
"context"
"encoding/json"
"fmt"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func (c *Controller) updateModelMetrics(jobName, namespace string, metrics []sednav1.Metric) error {
var err error
job, err := c.client.FederatedLearningJobs(namespace).Get(context.TODO(), jobName, metav1.GetOptions{})
if err != nil {
// federated crd not found
return err
}
modelName := job.Spec.AggregationWorker.Model.Name
client := c.client.Models(namespace)

return runtime.RetryUpdateStatus(modelName, namespace, (func() error {
model, err := client.Get(context.TODO(), modelName, metav1.GetOptions{})
if err != nil {
return err
}

now := metav1.Now()
model.Status.UpdateTime = &now
model.Status.Metrics = metrics
_, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{})
return err
}))
}

func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error {
client := c.client.FederatedLearningJobs(namespace)

return runtime.RetryUpdateStatus(name, namespace, (func() error {
job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
job.Status.Conditions = append(job.Status.Conditions, cond)
_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
return err
}))
}

// updateFromEdge updates the federated job's status
func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) (err error) {
// JobInfo defines the job information
type JobInfo struct {
// Current training round
CurrentRound int `json:"currentRound"`
UpdateTime string `json:"updateTime"`
}

// Output defines job output information
type Output struct {
Models []runtime.Model `json:"models"`
JobInfo *JobInfo `json:"ownerInfo"`
}

var status struct {
Phase string `json:"phase"`
Status string `json:"status"`
Output *Output `json:"output"`
}

err = json.Unmarshal(content, &status)
if err != nil {
return
}

output := status.Output

if output != nil {
// Update the model's metrics
if len(output.Models) > 0 {
// only one model
model := output.Models[0]
metrics := runtime.ConvertMapToMetrics(model.Metrics)
if len(metrics) > 0 {
c.updateModelMetrics(name, namespace, metrics)
}
}

jobInfo := output.JobInfo
// update job info if having any info
if jobInfo != nil && jobInfo.CurrentRound > 0 {
// Find a good place to save the progress info
// TODO: more meaningful reason/message
reason := "DoTraining"
message := fmt.Sprintf("Round %v reaches at %s", jobInfo.CurrentRound, jobInfo.UpdateTime)
cond := NewJobCondition(sednav1.FLJobCondTraining, reason, message)
c.appendStatusCondition(name, namespace, cond)
}
}

return nil
}

func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
return addFunc(KindName, c.updateFromEdge)
}
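Editor's note: for reference, the edge-side payload that updateFromEdge decodes has roughly this JSON shape. The field names come from the struct tags above; the values and the runtime.Model fields are illustrative only, since runtime.Model is defined outside this excerpt:

{
  "phase": "training",
  "status": "completed",
  "output": {
    "models": [
      {"url": "s3://models/example.pb", "metrics": {"accuracy": "0.92"}}
    ],
    "ownerInfo": {"currentRound": 3, "updateTime": "2021-04-28T09:00:00Z"}
  }
}

A currentRound greater than zero becomes an FLJobCondTraining condition on the job, and the first model's metrics are copied onto the aggregation model's status.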

pkg/globalmanager/controllers/incrementallearning/downstream.go (+145, -0)

@@ -0,0 +1,145 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package incrementallearning

import (
"context"
"fmt"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/klog/v2"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

// syncModelWithName will sync the model to the specified node.
// Now called when creating the incrementaljob.
func (c *Controller) syncModelWithName(nodeName, modelName, namespace string) error {
model, err := c.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{})
if err != nil {
// TODO: maybe use err.ErrStatus.Code == 404
return fmt.Errorf("model(%s/%s) not found", namespace, modelName)
}

// Since model.Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
if len(model.Kind) == 0 {
model.Kind = "Model"
}

runtime.InjectSecretAnnotations(c.kubeClient, model, model.Spec.CredentialName)

c.sendToEdgeFunc(nodeName, watch.Added, model)
return nil
}

func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
job, ok := obj.(*sednav1.IncrementalLearningJob)
if !ok {
return nil
}

// Since Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
job.Kind = KindName

jobConditions := job.Status.Conditions
if len(jobConditions) == 0 {
return nil
}

dataName := job.Spec.Dataset.Name
ds, err := c.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("dataset(%s/%s) not found", job.Namespace, dataName)
}
// The LC on this node holds the dataset object, so we refer to it as the dataset node.
dsNodeName := ds.Spec.NodeName

var trainNodeName string
var evalNodeName string

ann := job.GetAnnotations()
if ann != nil {
trainNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobTrain)]
evalNodeName = ann[runtime.AnnotationsKeyPrefix+string(sednav1.ILJobEval)]
}

if eventType == watch.Deleted {
// delete jobs from all LCs
for _, v := range []string{dsNodeName, trainNodeName, evalNodeName} {
if v != "" {
c.sendToEdgeFunc(v, eventType, job)
}
}
return nil
}

latestCondition := jobConditions[len(jobConditions)-1]
currentType := latestCondition.Type
jobStage := latestCondition.Stage

syncModelWithName := func(modelName string) {
if err := c.syncModelWithName(dsNodeName, modelName, job.Namespace); err != nil {
klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v",
modelName, job.Name, dsNodeName, err)
}
}

syncJobWithNodeName := func(nodeName string) {
if err := c.sendToEdgeFunc(nodeName, eventType, job); err != nil {
klog.Warningf("Error to sync incremental learning job %s to node %s in stage %s: %v",
job.Name, nodeName, jobStage, err)
}
}

runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName)

doJobStageEvent := func(modelName string, nodeName string) {
if currentType == sednav1.ILJobStageCondWaiting {
syncJobWithNodeName(dsNodeName)
syncModelWithName(modelName)
} else if currentType == sednav1.ILJobStageCondRunning {
if nodeName != "" {
syncJobWithNodeName(nodeName)
}
} else if currentType == sednav1.ILJobStageCondCompleted || currentType == sednav1.ILJobStageCondFailed {
if nodeName != dsNodeName {
// delete LC's job from nodeName that's different from dataset node when worker's status is completed or failed.
c.sendToEdgeFunc(nodeName, watch.Deleted, job)
}
}
}

switch jobStage {
case sednav1.ILJobTrain:
doJobStageEvent(job.Spec.InitialModel.Name, trainNodeName)
case sednav1.ILJobEval:
doJobStageEvent(job.Spec.DeploySpec.Model.Name, evalNodeName)
}

return nil
}

func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
c.sendToEdgeFunc = f
return nil
}
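Editor's note: the train/eval node names are read from stage-keyed annotations on the job. Assuming runtime.AnnotationsKeyPrefix is something like "sedna.io/" and the ILJobTrain/ILJobEval stage strings are "Train"/"Eval" (both assumptions; the constants are defined outside this excerpt), an annotated job would carry:

metadata:
  annotations:
    sedna.io/Train: edge-node-1
    sedna.io/Eval: edge-node-2

syncToEdge then fans the job out to the dataset node plus whichever stage node is active, and on Completed/Failed deletes the job from the stage node when it differs from the dataset node.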

pkg/globalmanager/incrementallearningjob.go → pkg/globalmanager/controllers/incrementallearning/incrementallearningjob.go (+221, -243)

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package globalmanager
+package incrementallearning
 
 import (
 	"context"
@@ -30,9 +30,8 @@ import (
 	utilrand "k8s.io/apimachinery/pkg/util/rand"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
-	kubeinformers "k8s.io/client-go/informers"
+	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
-	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/tools/cache"
@@ -42,28 +41,33 @@ import (
 	k8scontroller "k8s.io/kubernetes/pkg/controller"
 
 	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
-	clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
 	sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
-	informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
 	sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
 	"github.com/kubeedge/sedna/pkg/globalmanager/config"
-	messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
-	"github.com/kubeedge/sedna/pkg/globalmanager/utils"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
 )
 
-// ijControllerKind contains the schema.GroupVersionKind for this controller type.
-var ijControllerKind = sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob")
+const (
+	// Name is this controller name
+	Name = "IncrementalLearning"
 
-// IncrementalJobController ensures that all IncrementalLearningJob objects have corresponding pods to
+	// KindName is the kind name of CR this controller controls
+	KindName = "IncrementalLearningJob"
+)
+
+// Kind contains the schema.GroupVersionKind for this controller type.
+var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)
+
+// Controller ensures that all IncrementalLearningJob objects have corresponding pods to
 // run their configured workload.
-type IncrementalJobController struct {
+type Controller struct {
 	kubeClient kubernetes.Interface
 	client     sednaclientset.SednaV1alpha1Interface
 
 	// podStoreSynced returns true if the pod store has been synced at least once.
 	// Added as a member to the struct to allow injection for testing.
 	podStoreSynced cache.InformerSynced
-	// jobStoreSynced returns true if the incrementaljob store has been synced at least once.
+	// jobStoreSynced returns true if the job store has been synced at least once.
 	// Added as a member to the struct to allow injection for testing.
 	jobStoreSynced cache.InformerSynced
@@ -76,50 +80,49 @@ type IncrementalJobController struct {
 	// IncrementalLearningJobs that need to be updated
 	queue workqueue.RateLimitingInterface
 
-	recorder record.EventRecorder
-
 	cfg *config.ControllerConfig
+
+	sendToEdgeFunc runtime.DownstreamSendFunc
 }
 
-// Run the main goroutine responsible for watching and syncing jobs.
-func (jc *IncrementalJobController) Start() error {
+// Run starts the main goroutine responsible for watching and syncing jobs.
+func (c *Controller) Run(stopCh <-chan struct{}) {
+	// TODO: make workers parameter
 	workers := 1
-	stopCh := messageContext.Done()
 
-	go func() {
-		defer utilruntime.HandleCrash()
-		defer jc.queue.ShutDown()
-		klog.Infof("Starting incrementallearning job controller")
-		defer klog.Infof("Shutting down incrementallearning job controller")
+	defer utilruntime.HandleCrash()
+	defer c.queue.ShutDown()
 
-		if !cache.WaitForNamedCacheSync("incrementallearningjob", stopCh, jc.podStoreSynced, jc.jobStoreSynced) {
-			klog.Errorf("failed to wait for caches to sync")
+	klog.Infof("Starting %s controller", Name)
+	defer klog.Infof("Shutting down %s controller", Name)
 
-			return
-		}
-		klog.Infof("Starting incrementallearning job workers")
-		for i := 0; i < workers; i++ {
-			go wait.Until(jc.worker, time.Second, stopCh)
-		}
+	if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) {
		klog.Errorf("failed to wait for %s caches to sync", Name)
 
-		<-stopCh
-	}()
-	return nil
+		return
+	}
+
+	klog.Infof("Starting %s job workers", Name)
+	for i := 0; i < workers; i++ {
+		go wait.Until(c.worker, time.Second, stopCh)
+	}
+
+	<-stopCh
 }
 
 // enqueueByPod enqueues the jointInferenceService object of the specified pod.
-func (jc *IncrementalJobController) enqueueByPod(pod *v1.Pod, immediate bool) {
+func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
 	controllerRef := metav1.GetControllerOf(pod)
 
 	if controllerRef == nil {
 		return
 	}
 
-	if controllerRef.Kind != ijControllerKind.Kind {
+	if controllerRef.Kind != Kind.Kind {
 		return
 	}
 
-	service, err := jc.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name)
+	service, err := c.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name)
 	if err != nil {
 		return
 	}
@@ -128,27 +131,27 @@ func (jc *IncrementalJobController) enqueueByPod(pod *v1.Pod, immediate bool) {
 		return
 	}
 
-	jc.enqueueController(service, immediate)
+	c.enqueueController(service, immediate)
 }
 
 // When a pod is created, enqueue the controller that manages it and update it's expectations.
-func (jc *IncrementalJobController) addPod(obj interface{}) {
+func (c *Controller) addPod(obj interface{}) {
 	pod := obj.(*v1.Pod)
 	if pod.DeletionTimestamp != nil {
 		// on a restart of the controller, it's possible a new pod shows up in a state that
 		// is already pending deletion. Prevent the pod from being a creation observation.
-		jc.deletePod(pod)
+		c.deletePod(pod)
 		return
 	}
 
 	// backoff to queue when PodFailed
 	immediate := pod.Status.Phase != v1.PodFailed
 
-	jc.enqueueByPod(pod, immediate)
+	c.enqueueByPod(pod, immediate)
 }
 
 // When a pod is updated, figure out what joint inference service manage it and wake them up.
-func (jc *IncrementalJobController) updatePod(old, cur interface{}) {
+func (c *Controller) updatePod(old, cur interface{}) {
 	curPod := cur.(*v1.Pod)
 	oldPod := old.(*v1.Pod)
@@ -157,11 +160,11 @@ func (jc *IncrementalJobController) updatePod(old, cur interface{}) {
 		return
 	}
 
-	jc.addPod(curPod)
+	c.addPod(curPod)
 }
 
 // deletePod enqueues the jointinferenceservice obj When a pod is deleted
-func (jc *IncrementalJobController) deletePod(obj interface{}) {
+func (c *Controller) deletePod(obj interface{}) {
 	pod, ok := obj.(*v1.Pod)
 
 	// comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
@@ -182,13 +185,13 @@ func (jc *IncrementalJobController) deletePod(obj interface{}) {
 			return
 		}
 	}
-	jc.enqueueByPod(pod, true)
+	c.enqueueByPod(pod, true)
 }
 
 // obj could be an *sedna.IncrementalLearningJob, or a DeletionFinalStateUnknown marker item,
 // immediate tells the controller to update the status right away, and should
 // happen ONLY when there was a successful pod run.
-func (jc *IncrementalJobController) enqueueController(obj interface{}, immediate bool) {
+func (c *Controller) enqueueController(obj interface{}, immediate bool) {
 	key, err := k8scontroller.KeyFunc(obj)
 	if err != nil {
 		utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
@@ -197,36 +200,36 @@ func (jc *IncrementalJobController) enqueueController(obj interface{}, immediate
 
 	backoff := time.Duration(0)
 	if !immediate {
-		backoff = getBackoff(jc.queue, key)
+		backoff = runtime.GetBackoff(c.queue, key)
 	}
 
-	jc.queue.AddAfter(key, backoff)
+	c.queue.AddAfter(key, backoff)
 }
 
 // worker runs a worker thread that just dequeues items, processes them, and marks them done.
 // It enforces that the syncHandler is never invoked concurrently with the same key.
-func (jc *IncrementalJobController) worker() {
-	for jc.processNextWorkItem() {
+func (c *Controller) worker() {
+	for c.processNextWorkItem() {
 	}
 }
 
-func (jc *IncrementalJobController) processNextWorkItem() bool {
-	key, quit := jc.queue.Get()
+func (c *Controller) processNextWorkItem() bool {
+	key, quit := c.queue.Get()
 	if quit {
 		return false
 	}
-	defer jc.queue.Done(key)
+	defer c.queue.Done(key)
 
-	forget, err := jc.sync(key.(string))
+	forget, err := c.sync(key.(string))
 	if err == nil {
 		if forget {
-			jc.queue.Forget(key)
+			c.queue.Forget(key)
 		}
 		return true
 	}
 
 	utilruntime.HandleError(fmt.Errorf("Error syncing incrementallearning job: %v", err))
-	jc.queue.AddRateLimited(key)
+	c.queue.AddRateLimited(key)
 
 	return true
 }
@@ -234,7 +237,7 @@ func (jc *IncrementalJobController) processNextWorkItem() bool {
 // sync will sync the incrementallearning job with the given key if it has had its expectations fulfilled, meaning
 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
 // concurrently with the same key.
-func (jc *IncrementalJobController) sync(key string) (bool, error) {
+func (c *Controller) sync(key string) (bool, error) {
 	startTime := time.Now()
 	defer func() {
 		klog.V(4).Infof("Finished syncing incrementallearning job %q (%v)", key, time.Since(startTime))
@@ -247,7 +250,8 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) {
 	if len(ns) == 0 || len(name) == 0 {
 		return false, fmt.Errorf("invalid incrementallearning job key %q: either namespace or name is missing", key)
 	}
-	sharedIncrementalJob, err := jc.jobLister.IncrementalLearningJobs(ns).Get(name)
+
+	sharedJob, err := c.jobLister.IncrementalLearningJobs(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			klog.V(4).Infof("incrementallearning job has been deleted: %v", key)
@@ -255,19 +259,21 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) {
 		}
 		return false, err
 	}
-	incrementaljob := *sharedIncrementalJob
-	// set kind for incrementaljob in case that the kind is None
-	incrementaljob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob"))
-	// incrementaljob first start, create pod for inference
-	if incrementaljob.Status.StartTime == nil {
+
+	job := *sharedJob
+	// set kind in case that the kind is None
+	job.SetGroupVersionKind(Kind)
+
+	// when job is handled at first, create pod for inference
+	if job.Status.StartTime == nil {
 		now := metav1.Now()
-		incrementaljob.Status.StartTime = &now
-		pod := jc.getSpecifiedPods(&incrementaljob, InferencePodType)
+		job.Status.StartTime = &now
+		pod := c.getSpecifiedPods(&job, runtime.InferencePodType)
 		if pod == nil {
-			err = jc.createInferPod(&incrementaljob)
+			err = c.createInferPod(&job)
 		} else {
 			if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodPending {
-				err = jc.createInferPod(&incrementaljob)
+				err = c.createInferPod(&job)
 			}
 		}
 		if err != nil {
@@ -275,8 +281,8 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) {
 		}
 	}
 
-	// if incrementaljob was finished previously, we don't want to redo the termination
-	if IsIncrementalJobFinished(&incrementaljob) {
+	// if job was finished previously, we don't want to redo the termination
+	if IsJobFinished(&job) {
 		return true, nil
 	}
@@ -284,20 +290,20 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) {
 	jobFailed := false
 	needUpdated := false
 
-	// update conditions of incremental job
-	needUpdated, err = jc.updateIncrementalJobConditions(&incrementaljob)
+	// transit this job's state machine
+	needUpdated, err = c.transitJobState(&job)
 	if err != nil {
-		klog.V(2).Infof("incrementallearning job %v/%v faied to be updated, err:%s", incrementaljob.Namespace, incrementaljob.Name, err)
+		klog.V(2).Infof("incrementallearning job %v/%v failed to be updated, err:%s", job.Namespace, job.Name, err)
 	}
 
 	if needUpdated {
-		if err := jc.updateIncrementalJobStatus(&incrementaljob); err != nil {
+		if err := c.updateJobStatus(&job); err != nil {
 			return forget, err
 		}
 
-		if jobFailed && !IsIncrementalJobFinished(&incrementaljob) {
-			// returning an error will re-enqueue IncrementalJob after the backoff period
-			return forget, fmt.Errorf("failed pod(s) detected for incrementaljob key %q", key)
+		if jobFailed && !IsJobFinished(&job) {
+			// returning an error will re-enqueue IncrementalLearningJob after the backoff period
+			return forget, fmt.Errorf("failed pod(s) detected for incrementallearning job key %q", key)
 		}
 
 		forget = true
@@ -308,65 +314,60 @@ func (jc *IncrementalJobController) sync(key string) (bool, error) {
 
 // setWorkerNodeNameOfJob sets the worker nodeName of the specified job
 // which is used for downstream to sync job info to the specified LC located in nodeName.
-func (jc *IncrementalJobController) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error {
-	key := AnnotationsKeyPrefix + jobStage
+func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error {
+	key := runtime.AnnotationsKeyPrefix + jobStage
 
 	ann := job.GetAnnotations()
-	if ann != nil {
-		if ann[key] == nodeName {
-			// already set
-			return nil
-		}
+	if ann[key] == nodeName {
		// already set
		return nil
 	}
-	dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName)
 
-	jobClient := jc.client.IncrementalLearningJobs(job.Namespace)
-	var err error
-	for i := 0; i <= ResourceUpdateRetries; i++ {
-		var newJob *sednav1.IncrementalLearningJob
-		newJob, err = jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
+	jobClient := c.client.IncrementalLearningJobs(job.Namespace)
+	return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
+		newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
 		if err != nil {
-			break
+			return err
 		}
 
 		annotations := newJob.GetAnnotations()
-		if annotations != nil {
-			if annotations[key] == nodeName {
-				return nil
-			}
-		}
-
-		if _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{}); err == nil {
-			break
+		if annotations[key] == nodeName {
+			return nil
 		}
-	}
 
-	return err
+		dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName)
+		_, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{})
+		return err
+	})
 }
 
-// updateIncrementalJobConditions ensures that conditions of incrementallearning job can be changed by podstatus
-func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljob *sednav1.IncrementalLearningJob) (bool, error) {
+// transitJobState transit job to next state
+func (c *Controller) transitJobState(job *sednav1.IncrementalLearningJob) (bool, error) {
 	var initialType sednav1.ILJobStageConditionType
 	var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{
 		Stage: sednav1.ILJobTrain,
 		Type:  initialType,
 	}
+
 	var newConditionType sednav1.ILJobStageConditionType
 	var needUpdated = false
-	jobConditions := incrementaljob.Status.Conditions
+
 	var podStatus v1.PodPhase = v1.PodUnknown
 	var pod *v1.Pod
 
+	jobConditions := job.Status.Conditions
 	if len(jobConditions) > 0 {
 		// get latest pod and pod status
 		latestCondition = (jobConditions)[len(jobConditions)-1]
-		klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", incrementaljob.Namespace, incrementaljob.Name,
+		klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", job.Namespace, job.Name,
 			latestCondition.Stage)
-		pod = jc.getSpecifiedPods(incrementaljob, string(latestCondition.Stage))
+		pod = c.getSpecifiedPods(job, string(latestCondition.Stage))
 
 		if pod != nil {
 			podStatus = pod.Status.Phase
 		}
 	}
+
 	jobStage := latestCondition.Stage
 	currentType := latestCondition.Type
 	newConditionType = currentType
@@ -383,14 +384,14 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo
 		// include train, eval, deploy pod
 		var err error
 		if jobStage == sednav1.ILJobDeploy {
-			err = jc.restartInferPod(incrementaljob)
+			err = c.restartInferPod(job)
 			if err != nil {
-				klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", incrementaljob.Namespace, incrementaljob.Name, err)
+				klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err)
 			} else {
-				klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", incrementaljob.Namespace, incrementaljob.Name)
+				klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name)
 			}
 		} else if podStatus != v1.PodPending && podStatus != v1.PodRunning {
-			err = jc.createPod(incrementaljob, jobStage)
+			err = c.createPod(job, jobStage)
 		}
 		if err != nil {
 			return needUpdated, err
@@ -406,17 +407,17 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo
 			newConditionType = sednav1.ILJobStageCondRunning
 
 			// add nodeName to job
-			if err := jc.setWorkerNodeNameOfJob(incrementaljob, string(jobStage), pod.Spec.NodeName); err != nil {
+			if err := c.setWorkerNodeNameOfJob(job, string(jobStage), pod.Spec.NodeName); err != nil {
 				return needUpdated, err
 			}
 		}
 	} else if podStatus == v1.PodSucceeded {
 		// watch pod status, if pod completed, set type completed
 		newConditionType = sednav1.ILJobStageCondCompleted
-		klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", incrementaljob.Namespace, incrementaljob.Name, jobStage)
+		klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage)
 	} else if podStatus == v1.PodFailed {
 		newConditionType = sednav1.ILJobStageCondFailed
-		klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", incrementaljob.Namespace, incrementaljob.Name, jobStage)
+		klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage)
 	}
 case sednav1.ILJobStageCondCompleted:
 	jobStage = getNextStage(jobStage)
@@ -429,31 +430,29 @@ func (jc *IncrementalJobController) updateIncrementalJobConditions(incrementaljo
 	default:
 		// do nothing when given other type out of cases
 	}
-	klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", incrementaljob.Namespace, incrementaljob.Name, jobConditions)
+
+	klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions)
 	if latestCondition.Type != newConditionType {
-		incrementaljob.Status.Conditions = append(incrementaljob.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage))
+		job.Status.Conditions = append(job.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage))
 		needUpdated = true
-		return needUpdated, nil
 	}
+
 	return needUpdated, nil
 }
 
-// updateIncrementalJobStatus ensures that jobstatus can be updated rightly
-func (jc *IncrementalJobController) updateIncrementalJobStatus(incrementaljob *sednav1.IncrementalLearningJob) error {
-	jobClient := jc.client.IncrementalLearningJobs(incrementaljob.Namespace)
-	var err error
-	for i := 0; i <= ResourceUpdateRetries; i++ {
-		var newIncrementalJob *sednav1.IncrementalLearningJob
-		newIncrementalJob, err = jobClient.Get(context.TODO(), incrementaljob.Name, metav1.GetOptions{})
+// updateJobStatus ensures that job status can be updated rightly
+func (c *Controller) updateJobStatus(job *sednav1.IncrementalLearningJob) error {
+	jobClient := c.client.IncrementalLearningJobs(job.Namespace)
+	return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
+		newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
 		if err != nil {
-			break
+			return err
 		}
-		newIncrementalJob.Status = incrementaljob.Status
-		if _, err = jobClient.UpdateStatus(context.TODO(), newIncrementalJob, metav1.UpdateOptions{}); err == nil {
-			break
-		}
-	}
-	return err
+
+		newJob.Status = job.Status
+		_, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
+		return err
+	})
 }
 
 func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, jobStage sednav1.ILJobStage) sednav1.ILJobCondition {
@@ -468,26 +467,29 @@ func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, j
 	}
 }
 
-func (jc *IncrementalJobController) generatePodName(jobName string, workerType string) string {
+func (c *Controller) generatePodName(jobName string, workerType string) string {
 	return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5)
 }
 
-func (jc *IncrementalJobController) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod {
-	if podType == "Deploy" {
-		podType = InferencePodType
-	}
+func (c *Controller) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod {
 	var latestPod *v1.Pod
-	selector, _ := GenerateSelector(job)
-	pods, err := jc.podStore.Pods(job.Namespace).List(selector)
+	selector, _ := runtime.GenerateSelector(job)
+	pods, err := c.podStore.Pods(job.Namespace).List(selector)
 	if len(pods) == 0 || err != nil {
 		return nil
 	}
+
 	var matchTag = false
 	latestPod = pods[0]
+
+	if podType == "Deploy" {
+		podType = runtime.InferencePodType
+	}
+
 	for _, pod := range pods {
 		s := strings.Split(pod.Name, "-")
-		CurrentPodType := s[len(s)-2]
-		if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && CurrentPodType == strings.ToLower(podType) {
+		currentPodType := s[len(s)-2]
+		if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && currentPodType == strings.ToLower(podType) {
 			latestPod = pod
 			matchTag = true
 		}
@@ -498,20 +500,22 @@ func (jc *IncrementalJobController) getSpecifiedPods(job *sednav1.IncrementalLea
 	return latestPod
 }
 
-func (jc *IncrementalJobController) restartInferPod(job *sednav1.IncrementalLearningJob) error {
-	inferPod := jc.getSpecifiedPods(job, InferencePodType)
+func (c *Controller) restartInferPod(job *sednav1.IncrementalLearningJob) error {
+	inferPod := c.getSpecifiedPods(job, runtime.InferencePodType)
 	if inferPod == nil {
 		klog.V(2).Infof("No inferpod is running in incrementallearning job %v/%v", job.Namespace, job.Name)
-		err := jc.createInferPod(job)
+		err := c.createInferPod(job)
 		return err
 	}
+
 	ctx := context.Background()
-	err := jc.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{})
+	err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{})
 	if err != nil {
 		klog.Warningf("failed to delete inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
 		return err
 	}
-	err = jc.createInferPod(job)
+
+	err = c.createInferPod(job)
 	if err != nil {
 		klog.Warningf("failed to create inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
 		return err
@@ -532,14 +536,14 @@ func getNextStage(currentStage sednav1.ILJobStage) sednav1.ILJobStage {
 	}
 }
 
-func IsIncrementalJobFinished(j *sednav1.IncrementalLearningJob) bool {
+func IsJobFinished(j *sednav1.IncrementalLearningJob) bool {
 	// TODO
 	return false
 }
 
-func (jc *IncrementalJobController) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) {
+func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) {
 	if name != "" {
-		secret, err = jc.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
+		secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
 		if err != nil {
 			err = fmt.Errorf("failed to get the secret %s for %s: %w",
 				name,
@@ -549,7 +553,7 @@ func (jc *IncrementalJobController) getSecret(namespace, name string, ownerStr s
 	return
 }
 
-func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) {
+func (c *Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) {
 	ctx := context.Background()
 	var podTemplate *v1.PodTemplateSpec
@@ -558,25 +562,25 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 	deployModelName := job.Spec.DeploySpec.Model.Name
 
 	// check initial model name
-	initialModel, err := jc.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{})
+	initialModel, err := c.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get initial model %s: %w",
 			initialModelName, err)
 	}
 
-	_, err = jc.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{})
+	_, err = c.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get deploy model %s: %w",
 			deployModelName, err)
 	}
 
-	dataset, err := jc.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{})
+	dataset, err := c.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get dataset %s: %w",
 			incrementalDatasetName, err)
 	}
 
-	datasetSecret, err := jc.getSecret(
+	datasetSecret, err := c.getSecret(
 		job.Namespace,
 		dataset.Spec.CredentialName,
 		fmt.Sprintf("dataset %s", dataset.Name),
@@ -585,7 +589,7 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 		return err
 	}
 
-	jobSecret, err := jc.getSecret(
+	jobSecret, err := c.getSecret(
 		job.Namespace,
 		job.Spec.CredentialName,
 		fmt.Sprintf("incremental job %s", job.Name),
@@ -595,13 +599,14 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 	}
 
 	// get all url for train and eval from data in condition
-	var cond IncrementalCondData
 	condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data
 	klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr)
+	var cond IncrementalCondData
 	(&cond).Unmarshal([]byte(condDataStr))
 	if cond.Input == nil {
 		return fmt.Errorf("empty input from condData")
 	}
+
 	dataURL := cond.Input.DataURL
 	inputmodelURLs := cond.GetInputModelURLs()
@@ -614,25 +619,26 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 		originalDataURLOrIndex = dataset.Spec.URL
 	}
 
-	var workerParam *WorkerParam = new(WorkerParam)
+	var workerParam runtime.WorkerParam
+
 	if podtype == sednav1.ILJobTrain {
-		workerParam.workerType = TrainPodType
+		workerParam.WorkerType = runtime.TrainPodType
 
 		podTemplate = &job.Spec.TrainSpec.Template
-		// Env parameters for train
 
-		workerParam.env = map[string]string{
+		// Env parameters for train
+		workerParam.Env = map[string]string{
 			"NAMESPACE":   job.Namespace,
 			"JOB_NAME":    job.Name,
 			"WORKER_NAME": "train-worker-" + utilrand.String(5),
 
-			"LC_SERVER": jc.cfg.LC.Server,
+			"LC_SERVER": c.cfg.LC.Server,
 		}
 
 		baseModelURL := inputmodelURLs[0]
 		var baseModelSecret *v1.Secret
 		if baseModelURL == initialModel.Spec.URL {
-			baseModelSecret, err = jc.getSecret(
+			baseModelSecret, err = c.getSecret(
 				job.Namespace,
 				initialModel.Spec.CredentialName,
 				fmt.Sprintf("initial model %s", initialModelName),
@@ -644,17 +650,17 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 			baseModelSecret = jobSecret
 		}
 
-		workerParam.mounts = append(workerParam.mounts,
-			WorkerMount{
-				URL: &MountURL{
+		workerParam.Mounts = append(workerParam.Mounts,
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					URL:                   baseModelURL,
 					Secret:                baseModelSecret,
 					DownloadByInitializer: true,
 				},
 				EnvName: "BASE_MODEL_URL",
 			},
-			WorkerMount{
-				URL: &MountURL{
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					URL:                   cond.Input.OutputDir,
 					Secret:                jobSecret,
 					DownloadByInitializer: false,
@@ -662,8 +668,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 				EnvName: "MODEL_URL",
 			},
 
-			WorkerMount{
-				URL: &MountURL{
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					URL:                   dataURL,
 					DownloadByInitializer: true,
 					Secret:                jobSecret,
@@ -672,8 +678,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 			},
 
 			// see https://github.com/kubeedge/sedna/issues/35
-			WorkerMount{
-				URL: &MountURL{
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					Secret:                datasetSecret,
 					URL:                   originalDataURLOrIndex,
 					DownloadByInitializer: true,
@@ -683,23 +689,23 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 			},
 		)
 	} else {
+		// Configure eval worker's mounts and envs
 		podTemplate = &job.Spec.EvalSpec.Template
-		workerParam.workerType = "Eval"
+		workerParam.WorkerType = "Eval"
 
-		// Configure Env information for eval by initial WorkerParam
-		workerParam.env = map[string]string{
+		workerParam.Env = map[string]string{
 			"NAMESPACE":   job.Namespace,
 			"JOB_NAME":    job.Name,
 			"WORKER_NAME": "eval-worker-" + utilrand.String(5),
 
-			"LC_SERVER": jc.cfg.LC.Server,
+			"LC_SERVER": c.cfg.LC.Server,
 		}
 
-		var modelMountURLs []MountURL
+		var modelMountURLs []runtime.MountURL
 		for _, url := range inputmodelURLs {
 			var modelSecret *v1.Secret
 			if url == initialModel.Spec.URL {
-				modelSecret, err = jc.getSecret(
+				modelSecret, err = c.getSecret(
 					job.Namespace,
 					initialModel.Spec.CredentialName,
 					fmt.Sprintf("initial model %s", initialModelName),
@@ -711,21 +717,21 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 				modelSecret = jobSecret
 			}
 
-			modelMountURLs = append(modelMountURLs, MountURL{
+			modelMountURLs = append(modelMountURLs, runtime.MountURL{
 				URL:                   url,
 				Secret:                modelSecret,
 				DownloadByInitializer: true,
 			})
 		}
-		workerParam.mounts = append(workerParam.mounts,
-			WorkerMount{
+		workerParam.Mounts = append(workerParam.Mounts,
+			runtime.WorkerMount{
 				URLs:    modelMountURLs,
 				Name:    "models",
 				EnvName: "MODEL_URLS",
 			},
 
-			WorkerMount{
-				URL: &MountURL{
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					URL:                   dataURL,
 					Secret:                datasetSecret,
 					DownloadByInitializer: true,
@@ -734,8 +740,8 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 				EnvName: "TEST_DATASET_URL",
 			},
 
-			WorkerMount{
-				URL: &MountURL{
+			runtime.WorkerMount{
+				URL: &runtime.MountURL{
 					Secret:                datasetSecret,
 					URL:                   originalDataURLOrIndex,
 					DownloadByInitializer: true,
@@ -748,40 +754,38 @@ func (jc *IncrementalJobController) createPod(job *sednav1.IncrementalLearningJo
 	}
 
 	// set the default policy instead of Always policy
-	workerParam.restartPolicy = v1.RestartPolicyOnFailure
-	workerParam.hostNetwork = true
+	workerParam.RestartPolicy = v1.RestartPolicyOnFailure
+	workerParam.HostNetwork = true
 
 	// create pod based on podtype
-	_, err = createPodWithTemplate(jc.kubeClient, job, podTemplate, workerParam)
-	if err != nil {
-		return err
-	}
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, &workerParam)
 	return
 }
 
-func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearningJob) error {
+func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error {
 	infermodelName := job.Spec.DeploySpec.Model.Name
-	inferModel, err := jc.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{})
+	inferModel, err := c.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get infer model %s: %w",
 			infermodelName, err)
 	}
+
 	inferModelURL := inferModel.Spec.URL
 
-	// Env parameters for edge
 	HEMParameterJSON, _ := json.Marshal(job.Spec.DeploySpec.HardExampleMining.Parameters)
 	HEMParameterString := string(HEMParameterJSON)
 
-	// Configure container mounting and Env information by initial WorkerParam
-	modelSecret, err := jc.getSecret(
+	modelSecret, err := c.getSecret(
 		job.Namespace,
 		inferModel.Spec.CredentialName,
 		fmt.Sprintf("model %s", inferModel.Name),
 	)
-	var workerParam *WorkerParam = new(WorkerParam)
-	workerParam.mounts = append(workerParam.mounts,
-		WorkerMount{
-			URL: &MountURL{
+
+	// Configure inference worker's mounts and envs
+	var workerParam runtime.WorkerParam
+	workerParam.Mounts = append(workerParam.Mounts,
+		runtime.WorkerMount{
+			URL: &runtime.MountURL{
 				URL:                   inferModelURL,
 				Secret:                modelSecret,
 				DownloadByInitializer: true,
@@ -791,7 +795,7 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn
 		},
 	)
 
-	workerParam.env = map[string]string{
+	workerParam.Env = map[string]string{
 		"NAMESPACE":   job.Namespace,
 		"JOB_NAME":    job.Name,
 		"WORKER_NAME": "inferworker-" + utilrand.String(5),
@@ -799,71 +803,48 @@ func (jc *IncrementalJobController) createInferPod(job *sednav1.IncrementalLearn
 		"HEM_NAME":       job.Spec.DeploySpec.HardExampleMining.Name,
 		"HEM_PARAMETERS": HEMParameterString,
 
-		"LC_SERVER": jc.cfg.LC.Server,
+		"LC_SERVER": c.cfg.LC.Server,
 	}
 
-	workerParam.workerType = InferencePodType
-	workerParam.hostNetwork = true
+	workerParam.WorkerType = runtime.InferencePodType
+	workerParam.HostNetwork = true
 
-	// create edge pod
-	_, err = createPodWithTemplate(jc.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam)
+	// create the inference worker
+	_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, &workerParam)
 	return err
 }
 
-// GetName returns the name of the incrementallearning job controller
-func (jc *IncrementalJobController) GetName() string {
-	return "IncrementalLearningJobController"
-}
-
-// NewIncrementalJobController creates a new IncrementalJob controller that keeps the relevant pods
-// in sync with their corresponding IncrementalJob objects.
-func NewIncrementalJobController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
-	namespace := cfg.Namespace
-	if namespace == "" {
-		namespace = metav1.NamespaceAll
-	}
-	kubeClient, err := utils.KubeClient()
-	if err != nil {
-		return nil, err
-	}
-
-	kubecfg, err := utils.KubeConfig()
-	if err != nil {
-		return nil, err
-	}
-	crdclient, err := clientset.NewForConfig(kubecfg)
-	if err != nil {
-		return nil, err
-	}
-
-	kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace))
+// New creates a new incremental learning job controller that keeps the relevant pods
+// in sync with the corresponding IncrementalLearningJob objects.
+func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
+	podInformer := cc.KubeInformerFactory.Core().V1().Pods()
 
-	podInformer := kubeInformerFactory.Core().V1().Pods()
-
-	jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace))
-	jobInformer := jobInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs()
+	jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs()
 
 	eventBroadcaster := record.NewBroadcaster()
-	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
+	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})
 
-	jc := &IncrementalJobController{
-		kubeClient: kubeClient,
-		client:     crdclient.SednaV1alpha1(),
+	jc := &Controller{
+		kubeClient: cc.KubeClient,
+		client:     cc.SednaClient.SednaV1alpha1(),
 
-		queue:    workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "incrementallearningjob"),
-		recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "incrementallearningjob-controller"}),
-		cfg:      cfg,
+		queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name),
+
+		cfg: cc.Config,
 	}
 
 	jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj interface{}) {
 			jc.enqueueController(obj, true)
+			jc.syncToEdge(watch.Added, obj)
 		},
 		UpdateFunc: func(old, cur interface{}) {
 			jc.enqueueController(cur, true)
+			jc.syncToEdge(watch.Added, cur)
 		},
 		DeleteFunc: func(obj interface{}) {
 			jc.enqueueController(obj, true)
+			jc.syncToEdge(watch.Deleted, obj)
 		},
 	})
 	jc.jobLister = jobInformer.Lister()
@@ -877,8 +858,5 @@ func NewIncrementalJobController(cfg *config.ControllerConfig) (FeatureControlle
 	jc.podStore = podInformer.Lister()
 	jc.podStoreSynced = podInformer.Informer().HasSynced
 
-	stopCh := make(chan struct{})
-	kubeInformerFactory.Start(stopCh)
-	jobInformerFactory.Start(stopCh)
-	return jc, err
+	return jc, nil
 }
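
Both setWorkerNodeNameOfJob and updateJobStatus above now delegate their get-mutate-retry loops to runtime.RetryUpdateStatus. The helper itself lives in pkg/globalmanager/runtime/common.go and is not shown in this diff; a minimal sketch of such a wrapper, assuming a small fixed retry budget (the real constant may differ):

```go
package runtime

import (
	"fmt"

	"k8s.io/klog/v2"
)

// resourceUpdateTries is an assumed retry budget, not taken from this diff.
const resourceUpdateTries = 3

// RetryUpdateStatus calls updateStatusFunc until it succeeds or the retry
// budget is exhausted, returning the last error. This mirrors the call
// shape used above: RetryUpdateStatus(name, namespace, func() error {...}).
func RetryUpdateStatus(name, namespace string, updateStatusFunc func() error) error {
	var err error
	for try := 1; try <= resourceUpdateTries; try++ {
		if err = updateStatusFunc(); err == nil {
			return nil
		}
		klog.Warningf("failed to update status of %s/%s, try %d: %v", namespace, name, try, err)
	}
	return fmt.Errorf("gave up updating status of %s/%s: %w", namespace, name, err)
}
```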

+162 -0 pkg/globalmanager/controllers/incrementallearning/upstream.go

@@ -0,0 +1,162 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package incrementallearning

import (
"context"
"encoding/json"
"fmt"
"strings"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type Model = runtime.Model

// the data of this condition including the input/output to do the next step
type IncrementalCondData struct {
Input *struct {
// Only one model cases
Model *Model `json:"model,omitempty"`
Models []Model `json:"models,omitempty"`

DataURL string `json:"dataURL,omitempty"`

// the data samples reference will be stored into this URL.
// The content of this url would be:
// # the first uncomment line means the directory
// s3://dataset/
// mnist/0.jpg
// mnist/1.jpg
DataIndexURL string `json:"dataIndexURL,omitempty"`

OutputDir string `json:"outputDir,omitempty"`
} `json:"input,omitempty"`

Output *struct {
Model *Model `json:"model,omitempty"`
Models []Model `json:"models,omitempty"`
} `json:"output,omitempty"`
}

func (cd *IncrementalCondData) joinModelURLs(model *Model, models []Model) []string {
var modelURLs []string
if model != nil {
modelURLs = append(modelURLs, model.GetURL())
} else {
for _, m := range models {
modelURLs = append(modelURLs, m.GetURL())
}
}
return modelURLs
}

func (cd *IncrementalCondData) GetInputModelURLs() []string {
return cd.joinModelURLs(cd.Input.Model, cd.Input.Models)
}

func (cd *IncrementalCondData) GetOutputModelURLs() []string {
return cd.joinModelURLs(cd.Output.Model, cd.Output.Models)
}

func (cd *IncrementalCondData) Unmarshal(data []byte) error {
return json.Unmarshal(data, cd)
}

func (cd IncrementalCondData) Marshal() ([]byte, error) {
return json.Marshal(cd)
}

func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error {
client := c.client.IncrementalLearningJobs(namespace)
return runtime.RetryUpdateStatus(name, namespace, (func() error {
job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
job.Status.Conditions = append(job.Status.Conditions, cond)
_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
return err
}))
}

// updateFromEdge syncs the edge updates to k8s
func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error {
var jobStatus struct {
Phase string `json:"phase"`
Status string `json:"status"`
}

err := json.Unmarshal(content, &jobStatus)
if err != nil {
return err
}

// Get the condition data.
// Here unmarshal and marshal immediately to skip the unnecessary fields
var condData IncrementalCondData
err = json.Unmarshal(content, &condData)
if err != nil {
return err
}
condDataBytes, _ := json.Marshal(&condData)

cond := sednav1.ILJobCondition{
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Now(),
LastTransitionTime: metav1.Now(),
Data: string(condDataBytes),
Message: "reported by lc",
}

switch strings.ToLower(jobStatus.Phase) {
case "train":
cond.Stage = sednav1.ILJobTrain
case "eval":
cond.Stage = sednav1.ILJobEval
case "deploy":
cond.Stage = sednav1.ILJobDeploy
default:
return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
}

switch strings.ToLower(jobStatus.Status) {
case "ready":
cond.Type = sednav1.ILJobStageCondReady
case "completed":
cond.Type = sednav1.ILJobStageCondCompleted
case "failed":
cond.Type = sednav1.ILJobStageCondFailed
case "waiting":
cond.Type = sednav1.ILJobStageCondWaiting
default:
return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
}

err = c.appendStatusCondition(name, namespace, cond)
if err != nil {
return fmt.Errorf("failed to append condition, err:%+w", err)
}
return nil
}

func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
return addFunc(KindName, c.updateFromEdge)
}
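
Taken together with updateFromEdge, the structs above define the GM side of the LC-to-GM contract. A hypothetical report from the LC for a completed train stage could look like the following (same package as upstream.go; the "format"/"url" keys are an assumption about runtime.Model's JSON tags, which this diff does not show):

```go
// exampleEdgeReport is a hypothetical LC payload; "phase" and "status"
// drive the two switches in updateFromEdge, the rest is IncrementalCondData.
var exampleEdgeReport = []byte(`{
	"phase":  "train",
	"status": "completed",
	"output": {"model": {"format": "pb", "url": "s3://models/output.pb"}}
}`)

// exampleUpdateFromEdge feeds the payload through the upstream path, which
// appends a Train/Completed condition to the named job. Note that the
// operation argument is currently unused by updateFromEdge.
func exampleUpdateFromEdge(c *Controller) error {
	return c.updateFromEdge("example-job", "default", "status", exampleEdgeReport)
}
```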

+56 -0 pkg/globalmanager/controllers/jointinference/downstream.go

@@ -0,0 +1,56 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package jointinference

import (
"fmt"

"k8s.io/apimachinery/pkg/watch"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
joint, ok := obj.(*sednav1.JointInferenceService)
if !ok {
return nil
}

// Since Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
joint.Kind = KindName

// Here only propagate to the nodes with non empty name
// FIXME: only the case that Spec.NodeName specified is support
nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName
if len(nodeName) == 0 {
return fmt.Errorf("empty node name")
}

if len(joint.Kind) == 0 {
joint.Kind = KindName
}
return c.sendToEdgeFunc(nodeName, eventType, joint)
}

func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
c.sendToEdgeFunc = f

return nil
}
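
Note the constraint syncToEdge enforces: a JointInferenceService is only propagated when its edge worker template pins a node. A minimal sketch of the relevant spec field, using the accessor chain from the function above (the surrounding package and function are illustrative):

```go
package example

import (
	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
)

// exampleService pins the edge worker to a node; without this nodeName the
// downstream sync above returns "empty node name" and the service never
// reaches the LC on the edge side.
func exampleService() *sednav1.JointInferenceService {
	s := &sednav1.JointInferenceService{}
	s.Spec.EdgeWorker.Template.Spec.NodeName = "edge-node-1"
	return s
}
```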

pkg/globalmanager/jointinferenceservice.go → pkg/globalmanager/controllers/jointinference/jointinferenceservice.go

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */


package globalmanager
package jointinference


import ( import (
"context" "context"
@@ -29,7 +29,7 @@ import (
utilrand "k8s.io/apimachinery/pkg/util/rand" utilrand "k8s.io/apimachinery/pkg/util/rand"
utilruntime "k8s.io/apimachinery/pkg/util/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/util/wait"
kubeinformers "k8s.io/client-go/informers"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1" v1core "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -41,26 +41,32 @@ import (
k8scontroller "k8s.io/kubernetes/pkg/controller" k8scontroller "k8s.io/kubernetes/pkg/controller"


sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/config" "github.com/kubeedge/sedna/pkg/globalmanager/config"
messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
"github.com/kubeedge/sedna/pkg/globalmanager/utils"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

const (
// Name is this controller name
Name = "JointInference"

// KindName is the kind name of CR this controller controls
KindName = "JointInferenceService"
) )


const ( const (
jointInferenceForEdge = "Edge" jointInferenceForEdge = "Edge"
jointInferenceForCloud = "Cloud" jointInferenceForCloud = "Cloud"
bigModelPort = 5000
) )


// jointServiceControllerKind contains the schema.GroupVersionKind for this controller type.
var jointServiceControllerKind = sednav1.SchemeGroupVersion.WithKind("JointInferenceService")
// Kind contains the schema.GroupVersionKind for this controller type.
var Kind = sednav1.SchemeGroupVersion.WithKind(Name)


// JointInferenceServiceController ensures that all JointInferenceService objects
// Controller ensures that all JointInferenceService objects
// have corresponding pods to run their configured workload. // have corresponding pods to run their configured workload.
type JointInferenceServiceController struct {
type Controller struct {
kubeClient kubernetes.Interface kubeClient kubernetes.Interface
client sednaclientset.SednaV1alpha1Interface client sednaclientset.SednaV1alpha1Interface


@@ -69,7 +75,7 @@ type JointInferenceServiceController struct {
// A store of pods // A store of pods
podStore corelisters.PodLister podStore corelisters.PodLister


// serviceStoreSynced returns true if the jointinferenceservice store has been synced at least once.
// serviceStoreSynced returns true if the JointInferenceService store has been synced at least once.
serviceStoreSynced cache.InformerSynced serviceStoreSynced cache.InformerSynced
// A store of service // A store of service
serviceLister sednav1listers.JointInferenceServiceLister serviceLister sednav1listers.JointInferenceServiceLister
@@ -80,48 +86,47 @@ type JointInferenceServiceController struct {
recorder record.EventRecorder recorder record.EventRecorder


cfg *config.ControllerConfig cfg *config.ControllerConfig

sendToEdgeFunc runtime.DownstreamSendFunc
} }


// Start starts the main goroutine responsible for watching and syncing services.
func (jc *JointInferenceServiceController) Start() error {
// Run starts the main goroutine responsible for watching and syncing services.
func (c *Controller) Run(stopCh <-chan struct{}) {
workers := 1 workers := 1
stopCh := messageContext.Done()


go func() {
defer utilruntime.HandleCrash()
defer jc.queue.ShutDown()
klog.Infof("Starting joint inference service controller")
defer klog.Infof("Shutting down joint inference service controller")
defer utilruntime.HandleCrash()
defer c.queue.ShutDown()


if !cache.WaitForNamedCacheSync("jointinferenceservice", stopCh, jc.podStoreSynced, jc.serviceStoreSynced) {
klog.Errorf("failed to wait for joint inferce service caches to sync")
klog.Infof("Starting %s controller", Name)
defer klog.Infof("Shutting down %s controller", Name)


return
}
if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.serviceStoreSynced) {
klog.Errorf("failed to wait for %s caches to sync", Name)


klog.Infof("Starting joint inference service workers")
for i := 0; i < workers; i++ {
go wait.Until(jc.worker, time.Second, stopCh)
}
return
}


<-stopCh
}()
return nil
klog.Infof("Starting %s workers", Name)
for i := 0; i < workers; i++ {
go wait.Until(c.worker, time.Second, stopCh)
}

<-stopCh
} }


// enqueueByPod enqueues the jointInferenceService object of the specified pod.
func (jc *JointInferenceServiceController) enqueueByPod(pod *v1.Pod, immediate bool) {
// enqueueByPod enqueues the JointInferenceService object of the specified pod.
func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
controllerRef := metav1.GetControllerOf(pod) controllerRef := metav1.GetControllerOf(pod)


if controllerRef == nil { if controllerRef == nil {
return return
} }


if controllerRef.Kind != jointServiceControllerKind.Kind {
if controllerRef.Kind != Kind.Kind {
return return
} }


service, err := jc.serviceLister.JointInferenceServices(pod.Namespace).Get(controllerRef.Name)
service, err := c.serviceLister.JointInferenceServices(pod.Namespace).Get(controllerRef.Name)
if err != nil { if err != nil {
return return
} }
@@ -130,27 +135,27 @@ func (jc *JointInferenceServiceController) enqueueByPod(pod *v1.Pod, immediate b
return return
} }


jc.enqueueController(service, immediate)
c.enqueueController(service, immediate)
} }


// When a pod is created, enqueue the controller that manages it and update it's expectations. // When a pod is created, enqueue the controller that manages it and update it's expectations.
func (jc *JointInferenceServiceController) addPod(obj interface{}) {
func (c *Controller) addPod(obj interface{}) {
pod := obj.(*v1.Pod) pod := obj.(*v1.Pod)
if pod.DeletionTimestamp != nil { if pod.DeletionTimestamp != nil {
// on a restart of the controller, it's possible a new pod shows up in a state that // on a restart of the controller, it's possible a new pod shows up in a state that
// is already pending deletion. Prevent the pod from being a creation observation. // is already pending deletion. Prevent the pod from being a creation observation.
jc.deletePod(pod)
c.deletePod(pod)
return return
} }


// backoff to queue when PodFailed // backoff to queue when PodFailed
immediate := pod.Status.Phase != v1.PodFailed immediate := pod.Status.Phase != v1.PodFailed


jc.enqueueByPod(pod, immediate)
c.enqueueByPod(pod, immediate)
} }


// When a pod is updated, figure out what joint inference service manage it and wake them up. // When a pod is updated, figure out what joint inference service manage it and wake them up.
func (jc *JointInferenceServiceController) updatePod(old, cur interface{}) {
func (c *Controller) updatePod(old, cur interface{}) {
curPod := cur.(*v1.Pod) curPod := cur.(*v1.Pod)
oldPod := old.(*v1.Pod) oldPod := old.(*v1.Pod)


@@ -159,11 +164,11 @@ func (jc *JointInferenceServiceController) updatePod(old, cur interface{}) {
return return
} }


jc.addPod(curPod)
c.addPod(curPod)
} }


// deletePod enqueues the jointinferenceservice obj when a pod is deleted
func (jc *JointInferenceServiceController) deletePod(obj interface{}) {
// deletePod enqueues the JointInferenceService obj when a pod is deleted
func (c *Controller) deletePod(obj interface{}) {
pod, ok := obj.(*v1.Pod)

// comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
@@ -171,7 +176,7 @@ func (jc *JointInferenceServiceController) deletePod(obj interface{}) {
// When a delete is dropped, the relist will notice a pod in the store not
// in the list, leading to the insertion of a tombstone object which contains
// the deleted key/value. Note that this value might be stale. If the pod
// changed labels the new jointinferenceservice will not be woken up till the periodic resync.
// changed labels the new JointInferenceService will not be woken up till the periodic resync.
if !ok {
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
@@ -184,13 +189,13 @@ func (jc *JointInferenceServiceController) deletePod(obj interface{}) {
return
}
}
jc.enqueueByPod(pod, true)
c.enqueueByPod(pod, true)
}


// obj could be an *sednav1.JointInferenceService, or a DeletionFinalStateUnknown marker item,
// immediate tells the controller to update the status right away, and should
// happen ONLY when there was a successful pod run.
func (jc *JointInferenceServiceController) enqueueController(obj interface{}, immediate bool) {
func (c *Controller) enqueueController(obj interface{}, immediate bool) {
key, err := k8scontroller.KeyFunc(obj)
if err != nil {
klog.Warningf("Couldn't get key for object %+v: %v", obj, err)
@@ -199,42 +204,42 @@ func (jc *JointInferenceServiceController) enqueueController(obj interface{}, im

backoff := time.Duration(0)
if !immediate {
backoff = getBackoff(jc.queue, key)
backoff = runtime.GetBackoff(c.queue, key)
}
jc.queue.AddAfter(key, backoff)
c.queue.AddAfter(key, backoff)
}


// worker runs a worker thread that just dequeues items, processes them, and marks them done.
// It enforces that the sync is never invoked concurrently with the same key.
func (jc *JointInferenceServiceController) worker() {
for jc.processNextWorkItem() {
func (c *Controller) worker() {
for c.processNextWorkItem() {
}
}

func (jc *JointInferenceServiceController) processNextWorkItem() bool {
key, quit := jc.queue.Get()
func (c *Controller) processNextWorkItem() bool {
key, quit := c.queue.Get()
if quit {
return false
}
defer jc.queue.Done(key)
defer c.queue.Done(key)

forget, err := jc.sync(key.(string))
forget, err := c.sync(key.(string))
if err == nil {
if forget {
jc.queue.Forget(key)
c.queue.Forget(key)
}
return true
}

klog.Warningf("Error syncing jointinference service: %v", err)
jc.queue.AddRateLimited(key)
c.queue.AddRateLimited(key)

return true
}


// sync will sync the jointinferenceservice with the given key.
// This function is not meant to be invoked concurrently with the same key.
func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
func (c *Controller) sync(key string) (bool, error) {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing jointinference service %q (%v)", key, time.Since(startTime))
@@ -247,7 +252,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
if len(ns) == 0 || len(name) == 0 {
return false, fmt.Errorf("invalid jointinference service key %q: either namespace or name is missing", key)
}
sharedJointinferenceservice, err := jc.serviceLister.JointInferenceServices(ns).Get(name)
sharedService, err := c.serviceLister.JointInferenceServices(ns).Get(name)
if err != nil {
if errors.IsNotFound(err) {
klog.V(4).Infof("JointInferenceService has been deleted: %v", key)
@@ -256,37 +261,38 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
return false, err
}

jointinferenceservice := *sharedJointinferenceservice
service := *sharedService

// if jointinferenceservice was finished previously, we don't want to redo the termination
if isJointinferenceserviceFinished(&jointinferenceservice) {
// if service was finished previously, we don't want to redo the termination
if isServiceFinished(&service) {
return true, nil
}

// set kind for jointinferenceservice in case that the kind is None
// set kind for service in case that the kind is None
// more details at https://github.com/kubernetes/kubernetes/issues/3030
jointinferenceservice.SetGroupVersionKind(jointServiceControllerKind)
service.SetGroupVersionKind(Kind)

selector, _ := GenerateSelector(&jointinferenceservice)
pods, err := jc.podStore.Pods(jointinferenceservice.Namespace).List(selector)
selector, _ := runtime.GenerateSelector(&service)
pods, err := c.podStore.Pods(service.Namespace).List(selector)

if err != nil {
return false, err
}

klog.V(4).Infof("list jointinference service %v/%v, %v pods: %v", jointinferenceservice.Namespace, jointinferenceservice.Name, len(pods), pods)
klog.V(4).Infof("list jointinference service %v/%v, %v pods: %v", service.Namespace, service.Name, len(pods), pods)

latestConditionLen := len(jointinferenceservice.Status.Conditions)
latestConditionLen := len(service.Status.Conditions)

active := calcActivePodCount(pods)
active := runtime.CalcActivePodCount(pods)
var failed int32 = 0

// neededCounts means that two pods should be created successfully in a jointinference service currently
// two pods consist of edge pod and cloud pod
var neededCounts int32 = 2
// jointinferenceservice first start
if jointinferenceservice.Status.StartTime == nil {
if service.Status.StartTime == nil {
now := metav1.Now()
jointinferenceservice.Status.StartTime = &now
service.Status.StartTime = &now
} else {
failed = neededCounts - active
}
@@ -298,7 +304,7 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {

// get the latest condition type
// based on that condition updated is appended, not inserted.
jobConditions := jointinferenceservice.Status.Conditions
jobConditions := service.Status.Conditions
if len(jobConditions) > 0 {
latestConditionType = (jobConditions)[len(jobConditions)-1].Type
}
@@ -311,12 +317,12 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
serviceFailed = true
// TODO: get the failed worker, and know which worker failed, edge inference worker or cloud inference worker
reason = "workerFailed"
message = "the worker of Jointinferenceservice failed"
message = "the worker of service failed"
newCondtionType = sednav1.JointInferenceServiceCondFailed
jc.recorder.Event(&jointinferenceservice, v1.EventTypeWarning, reason, message)
c.recorder.Event(&service, v1.EventTypeWarning, reason, message)
} else {
if len(pods) == 0 {
active, manageServiceErr = jc.createWorkers(&jointinferenceservice)
active, manageServiceErr = c.createWorkers(&service)
}
if manageServiceErr != nil {
serviceFailed = true
@@ -331,20 +337,20 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {

//
if newCondtionType != latestConditionType {
jointinferenceservice.Status.Conditions = append(jointinferenceservice.Status.Conditions, NewJointInferenceServiceCondition(newCondtionType, reason, message))
service.Status.Conditions = append(service.Status.Conditions, newServiceCondition(newCondtionType, reason, message))
}
forget := false

// no need to update the jointinferenceservice if the status hasn't changed since last time
if jointinferenceservice.Status.Active != active || jointinferenceservice.Status.Failed != failed || len(jointinferenceservice.Status.Conditions) != latestConditionLen {
jointinferenceservice.Status.Active = active
jointinferenceservice.Status.Failed = failed
if service.Status.Active != active || service.Status.Failed != failed || len(service.Status.Conditions) != latestConditionLen {
service.Status.Active = active
service.Status.Failed = failed

if err := jc.updateStatus(&jointinferenceservice); err != nil {
if err := c.updateStatus(&service); err != nil {
return forget, err
}

if serviceFailed && !isJointinferenceserviceFinished(&jointinferenceservice) {
if serviceFailed && !isServiceFinished(&service) {
// returning an error will re-enqueue jointinferenceservice after the backoff period
return forget, fmt.Errorf("failed pod(s) detected for jointinference service key %q", key)
}
@@ -355,8 +361,8 @@ func (jc *JointInferenceServiceController) sync(key string) (bool, error) {
return forget, manageServiceErr
}


// NewJointInferenceServiceCondition creates a new joint condition
func NewJointInferenceServiceCondition(conditionType sednav1.JointInferenceServiceConditionType, reason, message string) sednav1.JointInferenceServiceCondition {
// newServiceCondition creates a new joint condition
func newServiceCondition(conditionType sednav1.JointInferenceServiceConditionType, reason, message string) sednav1.JointInferenceServiceCondition {
return sednav1.JointInferenceServiceCondition{
Type: conditionType,
Status: v1.ConditionTrue,
@@ -367,24 +373,20 @@ func NewJointInferenceServiceCondition(conditionType sednav1.JointInferenceServi
}
}


func (jc *JointInferenceServiceController) updateStatus(jointinferenceservice *sednav1.JointInferenceService) error {
serviceClient := jc.client.JointInferenceServices(jointinferenceservice.Namespace)
var err error
for i := 0; i <= ResourceUpdateRetries; i = i + 1 {
var newJointinferenceservice *sednav1.JointInferenceService
newJointinferenceservice, err = serviceClient.Get(context.TODO(), jointinferenceservice.Name, metav1.GetOptions{})
if err != nil {
break
}
newJointinferenceservice.Status = jointinferenceservice.Status
if _, err = serviceClient.UpdateStatus(context.TODO(), newJointinferenceservice, metav1.UpdateOptions{}); err == nil {
break
}
}
return nil
}

func (c *Controller) updateStatus(service *sednav1.JointInferenceService) error {
client := c.client.JointInferenceServices(service.Namespace)
return runtime.RetryUpdateStatus(service.Name, service.Namespace, func() error {
newService, err := client.Get(context.TODO(), service.Name, metav1.GetOptions{})
if err != nil {
return err
}
newService.Status = service.Status
_, err = client.UpdateStatus(context.TODO(), newService, metav1.UpdateOptions{})
return err
})
}
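
The hand-rolled retry loop is replaced by runtime.RetryUpdateStatus, shared by every feature controller. Its body is not part of this hunk; one way it could be built on top of client-go's retry helper, sketched under that assumption (assumes "k8s.io/client-go/util/retry" and "k8s.io/klog/v2"):

	// Sketch only: retries the status-update closure with the default backoff.
	func RetryUpdateStatus(name, namespace string, updateStatusFunc func() error) error {
		return retry.OnError(retry.DefaultBackoff, func(err error) bool {
			// retry on any error until the backoff steps are exhausted
			return true
		}, func() error {
			err := updateStatusFunc()
			if err != nil {
				klog.Warningf("failed to update status of %s/%s: %v", namespace, name, err)
			}
			return err
		})
	}

Note also that the old loop returned nil even when every retry failed; funneling the closure's error back to the caller is one practical benefit of centralizing this logic.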


func isJointinferenceserviceFinished(j *sednav1.JointInferenceService) bool {
func isServiceFinished(j *sednav1.JointInferenceService) bool {
for _, c := range j.Status.Conditions {
if (c.Type == sednav1.JointInferenceServiceCondFailed) && c.Status == v1.ConditionTrue {
return true
@@ -393,11 +395,11 @@ func isJointinferenceserviceFinished(j *sednav1.JointInferenceService) bool {
return false
}


func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointInferenceService) (active int32, err error) {
func (c *Controller) createWorkers(service *sednav1.JointInferenceService) (active int32, err error) {
active = 0

// create cloud worker
err = jc.createCloudWorker(service)
err = c.createCloudWorker(service)
if err != nil {
return active, err
}
@@ -406,14 +408,14 @@ func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointI
// create k8s service for cloudPod
// FIXME(llhuii): only the case that Spec.NodeName is specified is supported,
// will support Spec.NodeSelector.
bigModelIP, err := GetNodeIPByName(jc.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
bigServicePort, err := CreateKubernetesService(jc.kubeClient, service, jointInferenceForCloud, bigModelPort, bigModelIP)
bigModelIP, err := runtime.GetNodeIPByName(c.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
bigServicePort, err := runtime.CreateKubernetesService(c.kubeClient, service, jointInferenceForCloud, bigModelPort, bigModelIP)
if err != nil {
return active, err
}

// create edge worker
err = jc.createEdgeWorker(service, bigServicePort)
err = c.createEdgeWorker(service, bigServicePort)
if err != nil {
return active, err
}
@@ -422,24 +424,24 @@ func (jc *JointInferenceServiceController) createWorkers(service *sednav1.JointI
return active, err
}


func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.JointInferenceService) error {
func (c *Controller) createCloudWorker(service *sednav1.JointInferenceService) error {
// deliver pod for cloudworker
cloudModelName := service.Spec.CloudWorker.Model.Name
cloudModel, err := jc.client.Models(service.Namespace).Get(context.Background(), cloudModelName, metav1.GetOptions{})
cloudModel, err := c.client.Models(service.Namespace).Get(context.Background(), cloudModelName, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get cloud model %s: %w",
cloudModelName, err)
}

var workerParam WorkerParam
var workerParam runtime.WorkerParam

secretName := cloudModel.Spec.CredentialName
var modelSecret *v1.Secret
if secretName != "" {
modelSecret, _ = jc.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
modelSecret, _ = c.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
}
workerParam.mounts = append(workerParam.mounts, WorkerMount{
URL: &MountURL{
workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{
URL: &runtime.MountURL{
URL: cloudModel.Spec.URL,
Secret: modelSecret,
DownloadByInitializer: true,
@@ -448,7 +450,7 @@ func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.Jo
EnvName: "MODEL_URL",
})

workerParam.env = map[string]string{
workerParam.Env = map[string]string{
"NAMESPACE": service.Namespace,
"SERVICE_NAME": service.Name,
"WORKER_NAME": "cloudworker-" + utilrand.String(5),
@@ -456,21 +458,21 @@ func (jc *JointInferenceServiceController) createCloudWorker(service *sednav1.Jo
"BIG_MODEL_BIND_PORT": strconv.Itoa(int(bigModelPort)),
}

workerParam.workerType = jointInferenceForCloud
workerParam.WorkerType = jointInferenceForCloud

// create cloud pod
_, err = createPodWithTemplate(jc.kubeClient,
_, err = runtime.CreatePodWithTemplate(c.kubeClient,
service,
&service.Spec.CloudWorker.Template,
&workerParam)
return err
}
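
Because WorkerParam now lives in the shared runtime package, its fields had to be exported (Go only exposes capitalized identifiers across package boundaries), hence mounts/env/workerType becoming Mounts/Env/WorkerType. The struct itself is defined in runtime/types.go, which this excerpt does not show; its shape as inferred from the usages above (a sketch, the real definition may carry more fields):

	// Inferred from usage in this diff, not the verbatim definition.
	type WorkerParam struct {
		Mounts      []WorkerMount     // model/dataset URLs to mount or download
		Env         map[string]string // environment variables passed to the worker
		WorkerType  string            // e.g. jointInferenceForCloud / jointInferenceForEdge
		HostNetwork bool              // edge workers use the host network to reach the LC
	}

	type WorkerMount struct {
		URL     *MountURL
		EnvName string // if set, the resolved URL is also exported via this env var
	}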


func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.JointInferenceService, bigServicePort int32) error {
func (c *Controller) createEdgeWorker(service *sednav1.JointInferenceService, bigServicePort int32) error {
// deliver pod for edgeworker
ctx := context.Background()
edgeModelName := service.Spec.EdgeWorker.Model.Name
edgeModel, err := jc.client.Models(service.Namespace).Get(ctx, edgeModelName, metav1.GetOptions{})
edgeModel, err := c.client.Models(service.Namespace).Get(ctx, edgeModelName, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get edge model %s: %w",
edgeModelName, err)
@@ -479,13 +481,13 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
secretName := edgeModel.Spec.CredentialName
var modelSecret *v1.Secret
if secretName != "" {
modelSecret, _ = jc.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
modelSecret, _ = c.kubeClient.CoreV1().Secrets(service.Namespace).Get(context.TODO(), secretName, metav1.GetOptions{})
}

// FIXME(llhuii): only the case that Spec.NodeName is specified is supported,
// will support Spec.NodeSelector.
// get bigModelIP from nodeName in cloudWorker
bigModelIP, err := GetNodeIPByName(jc.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
bigModelIP, err := runtime.GetNodeIPByName(c.kubeClient, service.Spec.CloudWorker.Template.Spec.NodeName)
if err != nil {
return fmt.Errorf("failed to get node ip: %w", err)
}
@@ -494,10 +496,10 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
HEMParameterJSON, _ := json.Marshal(edgeWorker.HardExampleMining.Parameters)
HEMParameterString := string(HEMParameterJSON)

var workerParam WorkerParam
var workerParam runtime.WorkerParam

workerParam.mounts = append(workerParam.mounts, WorkerMount{
URL: &MountURL{
workerParam.Mounts = append(workerParam.Mounts, runtime.WorkerMount{
URL: &runtime.MountURL{
URL: edgeModel.Spec.URL,
Secret: modelSecret,
DownloadByInitializer: true,
@@ -506,7 +508,7 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
EnvName: "MODEL_URL",
})

workerParam.env = map[string]string{
workerParam.Env = map[string]string{
"NAMESPACE": service.Namespace,
"SERVICE_NAME": service.Name,
"WORKER_NAME": "edgeworker-" + utilrand.String(5),
@@ -517,52 +519,37 @@ func (jc *JointInferenceServiceController) createEdgeWorker(service *sednav1.Joi
"HEM_NAME": edgeWorker.HardExampleMining.Name,
"HEM_PARAMETERS": HEMParameterString,

"LC_SERVER": jc.cfg.LC.Server,
"LC_SERVER": c.cfg.LC.Server,
}

workerParam.workerType = jointInferenceForEdge
workerParam.hostNetwork = true
workerParam.WorkerType = jointInferenceForEdge
workerParam.HostNetwork = true

// create edge pod
_, err = createPodWithTemplate(jc.kubeClient,
_, err = runtime.CreatePodWithTemplate(c.kubeClient,
service,
&service.Spec.EdgeWorker.Template,
&workerParam)
return err
}


// GetName returns the name of the joint inference controller
func (jc *JointInferenceServiceController) GetName() string {
return "JointInferenceServiceController"
}

// NewJointController creates a new JointInferenceService controller that keeps the relevant pods
// New creates a new JointInferenceService controller that keeps the relevant pods
// in sync with their corresponding JointInferenceService objects.
func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
var err error
namespace := cfg.Namespace
if namespace == "" {
namespace = metav1.NamespaceAll
}

kubeClient, _ := utils.KubeClient()
kubecfg, _ := utils.KubeConfig()
crdclient, _ := clientset.NewForConfig(kubecfg)
kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace))
func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
cfg := cc.Config

podInformer := kubeInformerFactory.Core().V1().Pods()
podInformer := cc.KubeInformerFactory.Core().V1().Pods()

serviceInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace))
serviceInformer := serviceInformerFactory.Sedna().V1alpha1().JointInferenceServices()
serviceInformer := cc.SednaInformerFactory.Sedna().V1alpha1().JointInferenceServices()

eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})

jc := &JointInferenceServiceController{
kubeClient: kubeClient,
client: crdclient.SednaV1alpha1(),
jc := &Controller{
kubeClient: cc.KubeClient,
client: cc.SednaClient.SednaV1alpha1(),

queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "jointinferenceservice"),
queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "jointinferenceservice"),
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "jointinferenceservice-controller"}),
cfg: cfg,
}
@@ -570,14 +557,17 @@ func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error
serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
jc.enqueueController(obj, true)
jc.syncToEdge(watch.Added, obj)
},

UpdateFunc: func(old, cur interface{}) {
jc.enqueueController(cur, true)
jc.syncToEdge(watch.Added, cur)
},

DeleteFunc: func(obj interface{}) {
jc.enqueueController(obj, true)
jc.syncToEdge(watch.Deleted, obj)
},
})

@@ -593,8 +583,5 @@ func NewJointController(cfg *config.ControllerConfig) (FeatureControllerI, error
jc.podStore = podInformer.Lister()
jc.podStoreSynced = podInformer.Informer().HasSynced

stopCh := messageContext.Done()
kubeInformerFactory.Start(stopCh)
serviceInformerFactory.Start(stopCh)
return jc, err
return jc, nil
}
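
New no longer builds its own clients and informer factories: everything arrives through runtime.ControllerContext, and informer start-up moves to the shared manager. Each feature package exporting a New of this shape is what makes the new controllers/registry.go possible; roughly, under assumed names (this is a sketch, not the file's actual content):

	// Assumed sketch of controllers/registry.go.
	package controllers

	import (
		ji "github.com/kubeedge/sedna/pkg/globalmanager/controllers/jointinference"
		ll "github.com/kubeedge/sedna/pkg/globalmanager/controllers/lifelonglearning"

		"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
	)

	type FeatureFactory func(*runtime.ControllerContext) (runtime.FeatureControllerI, error)

	// Registry maps each feature name to its constructor; the manager iterates
	// this map, constructs every controller, and runs them on a shared stop channel.
	var Registry = map[string]FeatureFactory{
		ji.Name: ji.New,
		ll.Name: ll.New,
	}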

+ 92
- 0
pkg/globalmanager/controllers/jointinference/upstream.go View File

@@ -0,0 +1,92 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package jointinference

import (
"context"
"encoding/json"
"fmt"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
)

func (c *Controller) updateMetrics(name, namespace string, metrics []sednav1.Metric) error {
client := c.client.JointInferenceServices(namespace)

return runtime.RetryUpdateStatus(name, namespace, func() error {
joint, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
joint.Status.Metrics = metrics
_, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{})
return err
})
}

// updateFromEdge syncs the edge updates to k8s
func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error {
// Output defines owner output information
type Output struct {
ServiceInfo map[string]interface{} `json:"ownerInfo"`
}

var status struct {
// Phase always should be "inference"
Phase string `json:"phase"`
Status string `json:"status"`
Output *Output `json:"output"`
}

err := json.Unmarshal(content, &status)
if err != nil {
return err
}

// TODO: propagate status.Status to k8s

output := status.Output
if output == nil || output.ServiceInfo == nil {
// no output info
klog.Warningf("empty status info for joint inference service %s/%s", namespace, name)
return nil
}

info := output.ServiceInfo

for _, ignoreTimeKey := range []string{
"startTime",
"updateTime",
} {
delete(info, ignoreTimeKey)
}

metrics := runtime.ConvertMapToMetrics(info)

err = c.updateMetrics(name, namespace, metrics)
if err != nil {
return fmt.Errorf("failed to update metrics, err:%+w", err)
}
return nil
}
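
runtime.ConvertMapToMetrics is not shown in this excerpt; assuming sednav1.Metric is a plain key/value pair, it presumably flattens the reported info map along these lines (a sketch, assumes "encoding/json"):

	// Sketch only: serialize each reported value into the CRD's metric list.
	func ConvertMapToMetrics(data map[string]interface{}) []sednav1.Metric {
		var metrics []sednav1.Metric
		for k, v := range data {
			// values can be numbers, strings or nested objects; serialize uniformly
			value, _ := json.Marshal(v)
			metrics = append(metrics, sednav1.Metric{Key: k, Value: string(value)})
		}
		return metrics
	}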

func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
return addFunc(KindName, c.updateFromEdge)
}
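
SetUpstreamHandler inverts the old dependency: instead of one monolithic upstream.go switching over every kind, each feature registers its own updateFromEdge callback. The generic side (controllers/upstream.go) is not shown here; a rough sketch of the dispatch it implies, with all names assumed (uses "fmt" and "strings"):

	// Assumed shape of the generic upstream dispatcher.
	type UpstreamHandler func(name, namespace, operation string, content []byte) error

	type UpstreamController struct {
		handlers map[string]UpstreamHandler // keyed by lower-cased kind name
	}

	// Add is what a feature's SetUpstreamHandler ultimately calls into.
	func (uc *UpstreamController) Add(kind string, handler UpstreamHandler) error {
		uc.handlers[strings.ToLower(kind)] = handler
		return nil
	}

	func (uc *UpstreamController) dispatch(kind, name, namespace, operation string, content []byte) error {
		handler, ok := uc.handlers[strings.ToLower(kind)]
		if !ok {
			return fmt.Errorf("no upstream handler for kind %q", kind)
		}
		return handler(name, namespace, operation, content)
	}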

+ 55
- 0
pkg/globalmanager/controllers/lifelonglearning/downstream.go View File

@@ -0,0 +1,55 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package lifelonglearning

import (
"fmt"

"k8s.io/apimachinery/pkg/watch"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
job, ok := obj.(*sednav1.LifelongLearningJob)
if !ok {
return nil
}

// Since Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
job.Kind = KindName

// Here only propagate to the nodes with non-empty name

// FIXME(llhuii): only the case that all workers have the same nodeName is supported,
// will support Spec.NodeSelector and different nodeName.
nodeName := job.Spec.TrainSpec.Template.Spec.NodeName
if len(nodeName) == 0 {
return fmt.Errorf("empty node name")
}

runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName)
return c.sendToEdgeFunc(nodeName, eventType, job)
}

func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
c.sendToEdgeFunc = f
return nil
}
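
The downstream side mirrors the upstream pattern: the shared downstream controller injects a send function, and the feature only decides which node should receive the object. The DownstreamSendFunc signature below is inferred from syncToEdge above; the sender itself is hypothetical:

	// Inferred signature; presumably defined in runtime/types.go (not shown here).
	type DownstreamSendFunc func(nodeName string, eventType watch.EventType, obj interface{}) error

	// At start-up the generic downstream controller would wire it in once:
	//   c.SetDownstreamSendFunc(func(nodeName string, eventType watch.EventType, obj interface{}) error {
	//       // hypothetical call; the real sender lives in the GM message layer
	//       return sendToNode(nodeName, eventType, obj)
	//   })
	// after which every informer event flows through syncToEdge to the right edge node.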

pkg/globalmanager/lifelonglearningjob.go → pkg/globalmanager/controllers/lifelonglearning/lifelonglearningjob.go View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package globalmanager
package lifelonglearning


import (
"context"
@@ -28,9 +28,8 @@ import (
utilrand "k8s.io/apimachinery/pkg/util/rand" utilrand "k8s.io/apimachinery/pkg/util/rand"
utilruntime "k8s.io/apimachinery/pkg/util/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/util/wait"
kubeinformers "k8s.io/client-go/informers"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1" v1core "k8s.io/client-go/kubernetes/typed/core/v1"
corelisters "k8s.io/client-go/listers/core/v1" corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/cache"
@@ -40,21 +39,25 @@ import (
k8scontroller "k8s.io/kubernetes/pkg/controller" k8scontroller "k8s.io/kubernetes/pkg/controller"


sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1" sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1" sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
informers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1" sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/config" "github.com/kubeedge/sedna/pkg/globalmanager/config"
messageContext "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
"github.com/kubeedge/sedna/pkg/globalmanager/utils"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
) )


// ljControllerKind contains the schema.GroupVersionKind for this controller type.
var ljControllerKind = sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob")
const (
// KindName is the kind name of CR this controller controls
KindName = "LifelongLearningJob"
// Name is this controller name
Name = "LifelongLearning"
)

// Kind contains the schema.GroupVersionKind for this controller type.
var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)


// LifelongLearningJobController ensures that all LifelongLearningJob objects have corresponding pods to
// Controller ensures that all LifelongLearningJob objects have corresponding pods to
// run their configured workload. // run their configured workload.
type LifelongLearningJobController struct {
type Controller struct {
kubeClient kubernetes.Interface kubeClient kubernetes.Interface
client sednaclientset.SednaV1alpha1Interface client sednaclientset.SednaV1alpha1Interface


@@ -74,50 +77,47 @@ type LifelongLearningJobController struct {
// LifelongLearningJobs that need to be updated // LifelongLearningJobs that need to be updated
queue workqueue.RateLimitingInterface queue workqueue.RateLimitingInterface


recorder record.EventRecorder

cfg *config.ControllerConfig cfg *config.ControllerConfig

sendToEdgeFunc runtime.DownstreamSendFunc
} }


// Run the main goroutine responsible for watching and syncing jobs.
func (jc *LifelongLearningJobController) Start() error {
workers := 1
stopCh := messageContext.Done()

go func() {
defer utilruntime.HandleCrash()
defer jc.queue.ShutDown()
klog.Infof("Starting lifelonglearning job controller")
defer klog.Infof("Shutting down lifelonglearning job controller")

if !cache.WaitForNamedCacheSync("lifelonglearningjob", stopCh, jc.podStoreSynced, jc.jobStoreSynced) {
klog.Errorf("failed to wait for caches to sync")

return
}
klog.Infof("Starting lifelonglearning job workers")
for i := 0; i < workers; i++ {
go wait.Until(jc.worker, time.Second, stopCh)
}

<-stopCh
}()
return nil
}

// Run starts the main goroutine responsible for watching and syncing jobs.
func (c *Controller) Run(stopCh <-chan struct{}) {
workers := 1

defer utilruntime.HandleCrash()
defer c.queue.ShutDown()

klog.Infof("Starting %s controller", Name)
defer klog.Infof("Shutting down %s controller", Name)

if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) {
klog.Errorf("failed to wait for %s caches to sync", Name)

return
}
klog.Infof("Starting %s workers", Name)
for i := 0; i < workers; i++ {
go wait.Until(c.worker, time.Second, stopCh)
}

<-stopCh
}

// enqueueByPod enqueues the lifelonglearningjob object of the specified pod.
func (jc *LifelongLearningJobController) enqueueByPod(pod *v1.Pod, immediate bool) {
func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
controllerRef := metav1.GetControllerOf(pod)

if controllerRef == nil {
return
}

if controllerRef.Kind != ljControllerKind.Kind {
if controllerRef.Kind != Kind.Kind {
return
}

service, err := jc.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name)
service, err := c.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name)
if err != nil {
return
}
@@ -126,27 +126,27 @@ func (jc *LifelongLearningJobController) enqueueByPod(pod *v1.Pod, immediate boo
return
}

jc.enqueueController(service, immediate)
c.enqueueController(service, immediate)
}


// When a pod is created, enqueue the controller that manages it and update its expectations.
func (jc *LifelongLearningJobController) addPod(obj interface{}) {
func (c *Controller) addPod(obj interface{}) {
pod := obj.(*v1.Pod)
if pod.DeletionTimestamp != nil {
// on a restart of the controller, it's possible a new pod shows up in a state that
// is already pending deletion. Prevent the pod from being a creation observation.
jc.deletePod(pod)
c.deletePod(pod)
return
}

// backoff to queue when PodFailed
immediate := pod.Status.Phase != v1.PodFailed

jc.enqueueByPod(pod, immediate)
c.enqueueByPod(pod, immediate)
}


// When a pod is updated, figure out which lifelonglearning job manages it and wake it up.
func (jc *LifelongLearningJobController) updatePod(old, cur interface{}) {
func (c *Controller) updatePod(old, cur interface{}) {
curPod := cur.(*v1.Pod)
oldPod := old.(*v1.Pod)

@@ -155,11 +155,11 @@ func (jc *LifelongLearningJobController) updatePod(old, cur interface{}) {
return
}

jc.addPod(curPod)
c.addPod(curPod)
}


// deletePod enqueues the lifelonglearningjob obj when a pod is deleted
func (jc *LifelongLearningJobController) deletePod(obj interface{}) {
func (c *Controller) deletePod(obj interface{}) {
pod, ok := obj.(*v1.Pod)

// comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
@@ -180,13 +180,13 @@ func (jc *LifelongLearningJobController) deletePod(obj interface{}) {
return
}
}
jc.enqueueByPod(pod, true)
c.enqueueByPod(pod, true)
}


// obj could be an *sedna.LifelongLearningJob, or a DeletionFinalStateUnknown marker item,
// immediate tells the controller to update the status right away, and should
// happen ONLY when there was a successful pod run.
func (jc *LifelongLearningJobController) enqueueController(obj interface{}, immediate bool) {
func (c *Controller) enqueueController(obj interface{}, immediate bool) {
key, err := k8scontroller.KeyFunc(obj)
if err != nil {
utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
@@ -195,36 +195,36 @@ func (jc *LifelongLearningJobController) enqueueController(obj interface{}, imme

backoff := time.Duration(0)
if !immediate {
backoff = getBackoff(jc.queue, key)
backoff = runtime.GetBackoff(c.queue, key)
}

jc.queue.AddAfter(key, backoff)
c.queue.AddAfter(key, backoff)
}


// worker runs a worker thread that just dequeues items, processes them, and marks them done.
// It enforces that the syncHandler is never invoked concurrently with the same key.
func (jc *LifelongLearningJobController) worker() {
for jc.processNextWorkItem() {
func (c *Controller) worker() {
for c.processNextWorkItem() {
}
}

func (jc *LifelongLearningJobController) processNextWorkItem() bool {
key, quit := jc.queue.Get()
func (c *Controller) processNextWorkItem() bool {
key, quit := c.queue.Get()
if quit {
return false
}
defer jc.queue.Done(key)
defer c.queue.Done(key)

forget, err := jc.sync(key.(string))
forget, err := c.sync(key.(string))
if err == nil {
if forget {
jc.queue.Forget(key)
c.queue.Forget(key)
}
return true
}

utilruntime.HandleError(fmt.Errorf("Error syncing lifelonglearning job: %v", err))
jc.queue.AddRateLimited(key)
c.queue.AddRateLimited(key)

return true
}
@@ -232,7 +232,7 @@ func (jc *LifelongLearningJobController) processNextWorkItem() bool {
// sync will sync the lifelonglearning job with the given key if it has had its expectations fulfilled, meaning
// it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
// concurrently with the same key.
func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
func (c *Controller) sync(key string) (bool, error) {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing lifelonglearning job %q (%v)", key, time.Since(startTime))
@@ -245,7 +245,7 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
if len(ns) == 0 || len(name) == 0 {
return false, fmt.Errorf("invalid lifelonglearning job key %q: either namespace or name is missing", key)
}
sharedLifelongLearningJob, err := jc.jobLister.LifelongLearningJobs(ns).Get(name)
sharedJob, err := c.jobLister.LifelongLearningJobs(ns).Get(name)
if err != nil {
if errors.IsNotFound(err) {
klog.V(4).Infof("lifelonglearning job has been deleted: %v", key)
@@ -253,18 +253,18 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
}
return false, err
}
lifelonglearningjob := *sharedLifelongLearningJob
job := *sharedJob
// set kind for lifelonglearningjob in case that the kind is None
lifelonglearningjob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("LifelongLearningJob"))
job.SetGroupVersionKind(Kind)

// lifelonglearningjob first start
if lifelonglearningjob.Status.StartTime == nil {
if job.Status.StartTime == nil {
// job is first in
now := metav1.Now()
lifelonglearningjob.Status.StartTime = &now
job.Status.StartTime = &now
}

// if lifelonglearningjob was finished previously, we don't want to redo the termination
if IsLifelongLearningJobFinished(&lifelonglearningjob) {
// if job was finished previously, we don't want to redo the termination
if IsJobFinished(&job) {
return true, nil
}


@@ -272,18 +272,18 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
jobFailed := false
needUpdated := false

// update conditions of lifelonglearning job
needUpdated, err = jc.updateLifelongLearningJobConditions(&lifelonglearningjob)
// transit this job's state machine
needUpdated, err = c.transitJobState(&job)
if err != nil {
klog.V(2).Infof("lifelonglearning job %v/%v failed to be updated, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err)
klog.V(2).Infof("lifelonglearning job %v/%v failed to be updated, err:%s", job.Namespace, job.Name, err)
}

if needUpdated {
if err := jc.updateLifelongLearningJobStatus(&lifelonglearningjob); err != nil {
if err := c.updateJobStatus(&job); err != nil {
return forget, err
}

if jobFailed && !IsLifelongLearningJobFinished(&lifelonglearningjob) {
if jobFailed && !IsJobFinished(&job) {
// returning an error will re-enqueue LifelongLearningJob after the backoff period
return forget, fmt.Errorf("failed pod(s) detected for lifelonglearningjob key %q", key)
}
@@ -294,24 +294,25 @@ func (jc *LifelongLearningJobController) sync(key string) (bool, error) {
return forget, err
}

// updateLifelongLearningJobConditions ensures that conditions of lifelonglearning job can be changed by podstatus
func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lifelonglearningjob *sednav1.LifelongLearningJob) (bool, error) {
// transitJobState transits the job to its next state
func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, error) {
var initialType sednav1.LLJobStageConditionType
var latestCondition sednav1.LLJobCondition = sednav1.LLJobCondition{
Stage: sednav1.LLJobTrain,
Type: initialType,
}

var newConditionType sednav1.LLJobStageConditionType
latestCondition.Stage = sednav1.LLJobTrain
var needUpdated = false
jobConditions := lifelonglearningjob.Status.Conditions
var podStatus v1.PodPhase = v1.PodUnknown
jobConditions := job.Status.Conditions
if len(jobConditions) > 0 {
// get latest pod and pod status
latestCondition = (jobConditions)[len(jobConditions)-1]
klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", lifelonglearningjob.Namespace, lifelonglearningjob.Name,
klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", job.Namespace, job.Name,
latestCondition.Stage)
pod := jc.getSpecifiedPods(lifelonglearningjob, string(latestCondition.Stage))
pod := c.getSpecifiedPods(job, string(latestCondition.Stage))

if pod != nil {
podStatus = pod.Status.Phase
@@ -333,14 +334,14 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif
// include train, eval, deploy pod
var err error
if jobStage == sednav1.LLJobDeploy {
err = jc.restartInferPod(lifelonglearningjob)
err = c.restartInferPod(job)
if err != nil {
klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", lifelonglearningjob.Namespace, lifelonglearningjob.Name, err)
klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err)
} else {
klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", lifelonglearningjob.Namespace, lifelonglearningjob.Name)
klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name)
}
} else if podStatus != v1.PodPending && podStatus != v1.PodRunning {
err = jc.createPod(lifelonglearningjob, jobStage)
err = c.createPod(job, jobStage)
}
if err != nil {
return needUpdated, err
@@ -358,13 +359,13 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif
} else if podStatus == v1.PodSucceeded {
// watch pod status, if pod completed, set type completed
newConditionType = sednav1.LLJobStageCondCompleted
klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage)
klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage)
} else if podStatus == v1.PodFailed {
newConditionType = sednav1.LLJobStageCondFailed
klog.V(2).Infof("lifelonglearning job %v/%v %v stage failed!", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobStage)
klog.V(2).Infof("lifelonglearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage)
}
case sednav1.LLJobStageCondCompleted:
jobStage = jc.getNextStage(jobStage)
jobStage = c.getNextStage(jobStage)
newConditionType = sednav1.LLJobStageCondWaiting

case sednav1.LLJobStageCondFailed:
@@ -374,34 +375,31 @@ func (jc *LifelongLearningJobController) updateLifelongLearningJobConditions(lif
default:
// do nothing when given other type out of cases
}
klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", lifelonglearningjob.Namespace, lifelonglearningjob.Name, jobConditions)

klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions)
if latestCondition.Type != newConditionType {
lifelonglearningjob.Status.Conditions = append(lifelonglearningjob.Status.Conditions, NewLifelongLearningJobCondition(newConditionType, jobStage))
job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(newConditionType, jobStage))
needUpdated = true
return needUpdated, nil
}
return needUpdated, nil
}


// updateLifelongLearningJobStatus ensures that jobstatus can be updated rightly
func (jc *LifelongLearningJobController) updateLifelongLearningJobStatus(lifelonglearningjob *sednav1.LifelongLearningJob) error {
jobClient := jc.client.LifelongLearningJobs(lifelonglearningjob.Namespace)
var err error
for i := 0; i <= ResourceUpdateRetries; i = i + 1 {
var newLifelongLearningJob *sednav1.LifelongLearningJob
newLifelongLearningJob, err = jobClient.Get(context.TODO(), lifelonglearningjob.Name, metav1.GetOptions{})
if err != nil {
break
}
newLifelongLearningJob.Status = lifelonglearningjob.Status
if _, err = jobClient.UpdateStatus(context.TODO(), newLifelongLearningJob, metav1.UpdateOptions{}); err == nil {
break
}
}
return err
}

// updateJobStatus ensures that jobstatus can be updated rightly
func (c *Controller) updateJobStatus(job *sednav1.LifelongLearningJob) error {
jobClient := c.client.LifelongLearningJobs(job.Namespace)
return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
if err != nil {
return err
}
newJob.Status = job.Status
_, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
return err
})
}


func NewLifelongLearningJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition {
func NewJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition {
return sednav1.LLJobCondition{
Type: conditionType,
Status: v1.ConditionTrue,
@@ -413,17 +411,17 @@ func NewLifelongLearningJobCondition(conditionType sednav1.LLJobStageConditionTy
}
}


func (jc *LifelongLearningJobController) generatePodName(jobName string, workerType string) string {
func (c *Controller) generatePodName(jobName string, workerType string) string {
return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5)
}


func (jc *LifelongLearningJobController) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod {
func (c *Controller) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod {
if podType == "Deploy" {
podType = InferencePodType
podType = runtime.InferencePodType
}
var latestPod *v1.Pod
selector, _ := GenerateSelector(job)
pods, err := jc.podStore.Pods(job.Namespace).List(selector)
selector, _ := runtime.GenerateSelector(job)
pods, err := c.podStore.Pods(job.Namespace).List(selector)
if len(pods) == 0 || err != nil {
return nil
}
@@ -443,20 +441,20 @@ func (jc *LifelongLearningJobController) getSpecifiedPods(job *sednav1.LifelongL
return latestPod
}


func (jc *LifelongLearningJobController) restartInferPod(job *sednav1.LifelongLearningJob) error {
inferPod := jc.getSpecifiedPods(job, InferencePodType)
func (c *Controller) restartInferPod(job *sednav1.LifelongLearningJob) error {
inferPod := c.getSpecifiedPods(job, runtime.InferencePodType)
if inferPod == nil {
klog.V(2).Infof("No inferpod is running in lifelonglearning job %v/%v", job.Namespace, job.Name)
err := jc.createInferPod(job)
err := c.createInferPod(job)
return err
}
ctx := context.Background()
err := jc.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{})
err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{})
if err != nil {
klog.Warningf("failed to delete inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
return err
}
err = jc.createInferPod(job)
err = c.createInferPod(job)
if err != nil {
klog.Warningf("failed to create inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
return err
@@ -464,7 +462,7 @@ func (jc *LifelongLearningJobController) restartInferPod(job *sednav1.LifelongLe
return nil
}


func (jc *LifelongLearningJobController) getNextStage(currentStage sednav1.LLJobStage) sednav1.LLJobStage {
func (c *Controller) getNextStage(currentStage sednav1.LLJobStage) sednav1.LLJobStage {
switch currentStage {
case sednav1.LLJobTrain:
return sednav1.LLJobEval
@@ -477,9 +475,9 @@ func (jc *LifelongLearningJobController) getNextStage(currentStage sednav1.LLJob
}
}


func (jc *LifelongLearningJobController) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) {
func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) {
if name != "" { if name != "" {
secret, err = jc.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
if err != nil { if err != nil {
err = fmt.Errorf("failed to get the secret %s for %s: %w", err = fmt.Errorf("failed to get the secret %s for %s: %w",
name, name,
@@ -489,23 +487,23 @@ func (jc *LifelongLearningJobController) getSecret(namespace, name string, owner
return return
} }


func IsLifelongLearningJobFinished(j *sednav1.LifelongLearningJob) bool {
func IsJobFinished(j *sednav1.LifelongLearningJob) bool {
// TODO
return false
}


func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) {
func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) {
ctx := context.Background()
var podTemplate *v1.PodTemplateSpec

LLDatasetName := job.Spec.Dataset.Name

dataset, err := jc.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{})
dataset, err := c.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to get dataset %s: %w", LLDatasetName, err)
}

datasetSecret, err := jc.getSecret(
datasetSecret, err := c.getSecret(
job.Namespace,
dataset.Spec.CredentialName,
fmt.Sprintf("dataset %s", dataset.Name),
@@ -514,7 +512,7 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
return err
}

jobSecret, err := jc.getSecret(
jobSecret, err := c.getSecret(
job.Namespace,
job.Spec.CredentialName,
fmt.Sprintf("lifelonglearning job %s", job.Name),
@@ -526,7 +524,7 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
// get all url for train and eval from data in condition
condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data
klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr)
var cond LifelongLearningCondData
var cond ConditionData
(&cond).Unmarshal([]byte(condDataStr))
if cond.Input == nil {
return fmt.Errorf("empty input from condData")
@@ -543,25 +541,25 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
originalDataURLOrIndex = dataset.Spec.URL originalDataURLOrIndex = dataset.Spec.URL
} }


var workerParam *WorkerParam = new(WorkerParam)
var workerParam *runtime.WorkerParam = new(runtime.WorkerParam)
if podtype == sednav1.LLJobTrain { if podtype == sednav1.LLJobTrain {
workerParam.workerType = "Train"
workerParam.WorkerType = "Train"


podTemplate = &job.Spec.TrainSpec.Template podTemplate = &job.Spec.TrainSpec.Template
// Env parameters for train // Env parameters for train


workerParam.env = map[string]string{
workerParam.Env = map[string]string{
"NAMESPACE": job.Namespace, "NAMESPACE": job.Namespace,
"JOB_NAME": job.Name, "JOB_NAME": job.Name,
"WORKER_NAME": "train-worker-" + utilrand.String(5), "WORKER_NAME": "train-worker-" + utilrand.String(5),


"LC_SERVER": jc.cfg.LC.Server,
"KB_SERVER": jc.cfg.KB.Server,
"LC_SERVER": c.cfg.LC.Server,
"KB_SERVER": c.cfg.KB.Server,
} }


workerParam.mounts = append(workerParam.mounts,
WorkerMount{
URL: &MountURL{
workerParam.Mounts = append(workerParam.Mounts,
runtime.WorkerMount{
URL: &runtime.MountURL{
URL: cond.Input.OutputDir, URL: cond.Input.OutputDir,
Secret: jobSecret, Secret: jobSecret,
DownloadByInitializer: false, DownloadByInitializer: false,
@@ -569,8 +567,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
EnvName: "OUTPUT_URL", EnvName: "OUTPUT_URL",
}, },


WorkerMount{
URL: &MountURL{
runtime.WorkerMount{
URL: &runtime.MountURL{
URL: dataURL, URL: dataURL,
Secret: jobSecret, Secret: jobSecret,
DownloadByInitializer: true, DownloadByInitializer: true,
@@ -579,8 +577,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
}, },


// see https://github.com/kubeedge/sedna/issues/35 // see https://github.com/kubeedge/sedna/issues/35
WorkerMount{
URL: &MountURL{
runtime.WorkerMount{
URL: &runtime.MountURL{
Secret: datasetSecret, Secret: datasetSecret,
URL: originalDataURLOrIndex, URL: originalDataURLOrIndex,
Indirect: dataset.Spec.URL != originalDataURLOrIndex, Indirect: dataset.Spec.URL != originalDataURLOrIndex,
@@ -591,35 +589,35 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
) )
} else { } else {
podTemplate = &job.Spec.EvalSpec.Template podTemplate = &job.Spec.EvalSpec.Template
workerParam.workerType = "Eval"
workerParam.WorkerType = "Eval"


// Configure Env information for eval by initial WorkerParam // Configure Env information for eval by initial WorkerParam
workerParam.env = map[string]string{
workerParam.Env = map[string]string{
"NAMESPACE": job.Namespace, "NAMESPACE": job.Namespace,
"JOB_NAME": job.Name, "JOB_NAME": job.Name,
"WORKER_NAME": "eval-worker-" + utilrand.String(5), "WORKER_NAME": "eval-worker-" + utilrand.String(5),


"LC_SERVER": jc.cfg.LC.Server,
"KB_SERVER": jc.cfg.KB.Server,
"LC_SERVER": c.cfg.LC.Server,
"KB_SERVER": c.cfg.KB.Server,
} }


var modelMountURLs []MountURL
var modelMountURLs []runtime.MountURL
for _, url := range inputmodelURLs { for _, url := range inputmodelURLs {
modelMountURLs = append(modelMountURLs, MountURL{
modelMountURLs = append(modelMountURLs, runtime.MountURL{
URL: url, URL: url,
Secret: jobSecret, Secret: jobSecret,
DownloadByInitializer: true, DownloadByInitializer: true,
}) })
} }
workerParam.mounts = append(workerParam.mounts,
WorkerMount{
workerParam.Mounts = append(workerParam.Mounts,
runtime.WorkerMount{
URLs: modelMountURLs, URLs: modelMountURLs,
Name: "models", Name: "models",
EnvName: "MODEL_URLS", EnvName: "MODEL_URLS",
}, },


WorkerMount{
URL: &MountURL{
runtime.WorkerMount{
URL: &runtime.MountURL{
URL: cond.Input.OutputDir, URL: cond.Input.OutputDir,
Secret: jobSecret, Secret: jobSecret,
DownloadByInitializer: false, DownloadByInitializer: false,
@@ -627,8 +625,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
EnvName: "OUTPUT_URL", EnvName: "OUTPUT_URL",
}, },


WorkerMount{
URL: &MountURL{
runtime.WorkerMount{
URL: &runtime.MountURL{
URL: dataURL, URL: dataURL,
Secret: datasetSecret, Secret: datasetSecret,
DownloadByInitializer: true, DownloadByInitializer: true,
@@ -637,8 +635,8 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
EnvName: "TEST_DATASET_URL", EnvName: "TEST_DATASET_URL",
}, },


WorkerMount{
URL: &MountURL{
runtime.WorkerMount{
URL: &runtime.MountURL{
Secret: datasetSecret, Secret: datasetSecret,
URL: originalDataURLOrIndex, URL: originalDataURLOrIndex,
DownloadByInitializer: true, DownloadByInitializer: true,
@@ -651,21 +649,21 @@ func (jc *LifelongLearningJobController) createPod(job *sednav1.LifelongLearning
} }


// set the default policy instead of Always policy // set the default policy instead of Always policy
workerParam.restartPolicy = v1.RestartPolicyOnFailure
workerParam.hostNetwork = true
workerParam.RestartPolicy = v1.RestartPolicyOnFailure
workerParam.HostNetwork = true


// create pod based on podtype // create pod based on podtype
_, err = createPodWithTemplate(jc.kubeClient, job, podTemplate, workerParam)
_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, workerParam)
if err != nil { if err != nil {
return err return err
} }
return return
} }


func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLearningJob) error {
func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error {
inferModelURL := strings.Join([]string{strings.TrimRight(job.Spec.OutputDir, "/"), "deploy/index.pkl"}, "/") inferModelURL := strings.Join([]string{strings.TrimRight(job.Spec.OutputDir, "/"), "deploy/index.pkl"}, "/")


jobSecret, err := jc.getSecret(
jobSecret, err := c.getSecret(
job.Namespace, job.Namespace,
job.Spec.CredentialName, job.Spec.CredentialName,
fmt.Sprintf("lifelonglearning job %s", job.Name), fmt.Sprintf("lifelonglearning job %s", job.Name),
@@ -674,10 +672,10 @@ func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLea
return err return err
} }


var workerParam *WorkerParam = new(WorkerParam)
workerParam.mounts = append(workerParam.mounts,
WorkerMount{
URL: &MountURL{
var workerParam *runtime.WorkerParam = new(runtime.WorkerParam)
workerParam.Mounts = append(workerParam.Mounts,
runtime.WorkerMount{
URL: &runtime.MountURL{
URL: inferModelURL, URL: inferModelURL,
Secret: jobSecret, Secret: jobSecret,
DownloadByInitializer: false, DownloadByInitializer: false,
@@ -687,75 +685,53 @@ func (jc *LifelongLearningJobController) createInferPod(job *sednav1.LifelongLea
}, },
) )


workerParam.env = map[string]string{
workerParam.Env = map[string]string{
"NAMESPACE": job.Namespace, "NAMESPACE": job.Namespace,
"JOB_NAME": job.Name, "JOB_NAME": job.Name,
"WORKER_NAME": "inferworker-" + utilrand.String(5), "WORKER_NAME": "inferworker-" + utilrand.String(5),


"LC_SERVER": jc.cfg.LC.Server,
"LC_SERVER": c.cfg.LC.Server,
} }


workerParam.workerType = InferencePodType
workerParam.hostNetwork = true
workerParam.WorkerType = runtime.InferencePodType
workerParam.HostNetwork = true


// create edge pod // create edge pod
_, err = createPodWithTemplate(jc.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam)
_, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam)
return err return err
} }


// GetName returns the name of the lifelonglearning job controller
func (jc *LifelongLearningJobController) GetName() string {
return "LifelongLearningJobController"
}

// NewLifelongLearningJobController creates a new LifelongLearningJob controller that keeps the relevant pods
// New creates a new LifelongLearningJob controller that keeps the relevant pods
// in sync with their corresponding LifelongLearningJob objects. // in sync with their corresponding LifelongLearningJob objects.
func NewLifelongLearningJobController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
namespace := cfg.Namespace
if namespace == "" {
namespace = metav1.NamespaceAll
}
kubeClient, err := utils.KubeClient()
if err != nil {
return nil, err
}

kubecfg, err := utils.KubeConfig()
if err != nil {
return nil, err
}
crdclient, err := clientset.NewForConfig(kubecfg)
if err != nil {
return nil, err
}

kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, time.Second*30, kubeinformers.WithNamespace(namespace))
func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
cfg := cc.Config


podInformer := kubeInformerFactory.Core().V1().Pods()
podInformer := cc.KubeInformerFactory.Core().V1().Pods()


jobInformerFactory := informers.NewSharedInformerFactoryWithOptions(crdclient, time.Second*30, informers.WithNamespace(namespace))
jobInformer := jobInformerFactory.Sedna().V1alpha1().LifelongLearningJobs()
jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().LifelongLearningJobs()


eventBroadcaster := record.NewBroadcaster() eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})


jc := &LifelongLearningJobController{
kubeClient: kubeClient,
client: crdclient.SednaV1alpha1(),
queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(DefaultBackOff, MaxBackOff), "lifelonglearningjob"),
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "lifelonglearningjob-controller"}),
jc := &Controller{
kubeClient: cc.KubeClient,
client: cc.SednaClient.SednaV1alpha1(),
queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name),
cfg: cfg, cfg: cfg,
} }


jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) { AddFunc: func(obj interface{}) {
jc.enqueueController(obj, true) jc.enqueueController(obj, true)
jc.syncToEdge(watch.Added, obj)
}, },
UpdateFunc: func(old, cur interface{}) { UpdateFunc: func(old, cur interface{}) {
jc.enqueueController(cur, true) jc.enqueueController(cur, true)
jc.syncToEdge(watch.Added, cur)
}, },
DeleteFunc: func(obj interface{}) { DeleteFunc: func(obj interface{}) {
jc.enqueueController(obj, true) jc.enqueueController(obj, true)
jc.syncToEdge(watch.Deleted, obj)
}, },
}) })
jc.jobLister = jobInformer.Lister() jc.jobLister = jobInformer.Lister()
@@ -769,8 +745,5 @@ func NewLifelongLearningJobController(cfg *config.ControllerConfig) (FeatureCont
jc.podStore = podInformer.Lister() jc.podStore = podInformer.Lister()
jc.podStoreSynced = podInformer.Informer().HasSynced jc.podStoreSynced = podInformer.Informer().HasSynced


stopCh := make(chan struct{})
kubeInformerFactory.Start(stopCh)
jobInformerFactory.Start(stopCh)
return jc, err
return jc, nil
} }

+ 164
- 0
pkg/globalmanager/controllers/lifelonglearning/upstream.go View File

@@ -0,0 +1,164 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package lifelonglearning

import (
"context"
"encoding/json"
"fmt"
"strings"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

type Model = runtime.Model

// the data of this condition including the input/output to do the next step
type ConditionData struct {
Input *struct {
// Only one model cases
Model *Model `json:"model,omitempty"`
Models []Model `json:"models,omitempty"`

DataURL string `json:"dataURL,omitempty"`

// the data samples reference will be stored into this URL.
// The content of this url would be:
// # the first uncomment line means the directory
// s3://dataset/
// mnist/0.jpg
// mnist/1.jpg
DataIndexURL string `json:"dataIndexURL,omitempty"`

OutputDir string `json:"outputDir,omitempty"`
} `json:"input,omitempty"`

Output *struct {
Model *Model `json:"model,omitempty"`
Models []Model `json:"models,omitempty"`
} `json:"output,omitempty"`
}

func (cd *ConditionData) joinModelURLs(model *Model, models []Model) []string {
var modelURLs []string
if model != nil {
modelURLs = append(modelURLs, model.GetURL())
} else {
for _, m := range models {
modelURLs = append(modelURLs, m.GetURL())
}
}
return modelURLs
}

func (cd *ConditionData) Unmarshal(data []byte) error {
return json.Unmarshal(data, cd)
}

func (cd ConditionData) Marshal() ([]byte, error) {
return json.Marshal(cd)
}

func (cd *ConditionData) GetInputModelURLs() []string {
return cd.joinModelURLs(cd.Input.Model, cd.Input.Models)
}

func (cd *ConditionData) GetOutputModelURLs() []string {
return cd.joinModelURLs(cd.Output.Model, cd.Output.Models)
}

func (c *Controller) appendStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error {
client := c.client.LifelongLearningJobs(namespace)
return runtime.RetryUpdateStatus(name, namespace, func() error {
job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
job.Status.Conditions = append(job.Status.Conditions, cond)
_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
return err
})
}

// updateFromEdge syncs the edge updates to k8s
func (c *Controller) updateFromEdge(name, namespace, operation string, content []byte) error {
var jobStatus struct {
Phase string `json:"phase"`
Status string `json:"status"`
}

err := json.Unmarshal(content, &jobStatus)
if err != nil {
return err
}

// Get the condition data.
// Here unmarshal and marshal immediately to skip the unnecessary fields
var condData ConditionData
err = json.Unmarshal(content, &condData)
if err != nil {
return err
}

condDataBytes, _ := json.Marshal(&condData)

cond := sednav1.LLJobCondition{
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Now(),
LastTransitionTime: metav1.Now(),
Data: string(condDataBytes),
Message: "reported by lc",
}

switch strings.ToLower(jobStatus.Phase) {
case "train":
cond.Stage = sednav1.LLJobTrain
case "eval":
cond.Stage = sednav1.LLJobEval
case "deploy":
cond.Stage = sednav1.LLJobDeploy
default:
return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
}

switch strings.ToLower(jobStatus.Status) {
case "ready":
cond.Type = sednav1.LLJobStageCondReady
case "completed":
cond.Type = sednav1.LLJobStageCondCompleted
case "failed":
cond.Type = sednav1.LLJobStageCondFailed
case "waiting":
cond.Type = sednav1.LLJobStageCondWaiting
default:
return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
}

err = c.appendStatusCondition(name, namespace, cond)
if err != nil {
return fmt.Errorf("failed to append condition, err:%+w", err)
}
return nil
}

func (c *Controller) SetUpstreamHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
return addFunc(KindName, c.updateFromEdge)
}
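
For a concrete sense of what updateFromEdge consumes, here is a hedged sketch of decoding a made-up LC report with the ConditionData type above (the payload values are illustrative, not from this commit; the sketch assumes it lives in the same lifelonglearning package):

func exampleDecode() []string {
	// hypothetical report as an LC might send it
	content := []byte(`{"phase":"train","status":"completed",` +
		`"output":{"models":[{"format":"pkl","url":"s3://bucket/model.pkl"}]}}`)

	var cond ConditionData
	if err := cond.Unmarshal(content); err != nil {
		return nil
	}
	// returns ["s3://bucket/model.pkl"], the same trimmed-down data
	// that updateFromEdge re-marshals into the job condition
	return cond.GetOutputModelURLs()
}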

+ 128
- 0
pkg/globalmanager/controllers/manager.go View File

@@ -0,0 +1,128 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"fmt"
"math/rand"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kubeinformers "k8s.io/client-go/informers"
"k8s.io/klog/v2"

clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
"github.com/kubeedge/sedna/pkg/globalmanager/config"
"github.com/kubeedge/sedna/pkg/globalmanager/messagelayer"
websocket "github.com/kubeedge/sedna/pkg/globalmanager/messagelayer/ws"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
"github.com/kubeedge/sedna/pkg/globalmanager/utils"
)

// Manager defines the controller manager
type Manager struct {
Config *config.ControllerConfig
}

// New creates the controller manager
func New(cc *config.ControllerConfig) *Manager {
config.InitConfigure(cc)
return &Manager{
Config: cc,
}
}

func genResyncPeriod(minPeriod time.Duration) time.Duration {
factor := rand.Float64() + 1
// [minPeriod, 2*minPeriod)
return time.Duration(factor * float64(minPeriod.Nanoseconds()))
}

// Start starts the controllers it has managed
func (m *Manager) Start() error {
kubeClient, err := utils.KubeClient()
if err != nil {
return err
}

kubecfg, err := utils.KubeConfig()
if err != nil {
return err
}

sednaClient, err := clientset.NewForConfig(kubecfg)
if err != nil {
return err
}

cfg := m.Config
namespace := cfg.Namespace
if namespace == "" {
namespace = metav1.NamespaceAll
}

// TODO(llhuii): make this period configurable
minResyncPeriod := time.Second * 30

kubeInformerFactory := kubeinformers.NewSharedInformerFactoryWithOptions(kubeClient, genResyncPeriod(minResyncPeriod), kubeinformers.WithNamespace(namespace))

sednaInformerFactory := sednainformers.NewSharedInformerFactoryWithOptions(sednaClient, genResyncPeriod(minResyncPeriod), sednainformers.WithNamespace(namespace))

context := &runtime.ControllerContext{
Config: m.Config,

KubeClient: kubeClient,
KubeInformerFactory: kubeInformerFactory,

SednaClient: sednaClient,
SednaInformerFactory: sednaInformerFactory,
}

uc, _ := NewUpstreamController(context)

downstreamSendFunc := messagelayer.NewContextMessageLayer().SendResourceObject

stopCh := make(chan struct{})

go uc.Run(stopCh)

for name, factory := range NewRegistry() {
f, err := factory(context)
if err != nil {
return fmt.Errorf("failed to initialize controller %s: %v", name, err)
}
f.SetDownstreamSendFunc(downstreamSendFunc)
f.SetUpstreamHandler(uc.Add)

klog.Infof("initialized controller %s", name)
go f.Run(stopCh)
}

kubeInformerFactory.Start(stopCh)
sednaInformerFactory.Start(stopCh)

addr := fmt.Sprintf("%s:%d", m.Config.WebSocket.Address, m.Config.WebSocket.Port)

ws := websocket.NewServer(addr)
err = ws.ListenAndServe()
if err != nil {
close(stopCh)
return fmt.Errorf("failed to listen websocket at %s: %v", addr, err)
}
return nil
}
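
genResyncPeriod above jitters each informer factory's resync interval into [minPeriod, 2*minPeriod) so the kube and sedna factories do not resync in lockstep. A test-style sketch of that property (assumed to sit in the same controllers package; not part of this commit):

package controllers

import (
	"testing"
	"time"
)

func TestGenResyncPeriodRange(t *testing.T) {
	min := 30 * time.Second
	for i := 0; i < 100; i++ {
		// factor is rand.Float64()+1, so the period stays in [min, 2*min)
		if p := genResyncPeriod(min); p < min || p >= 2*min {
			t.Fatalf("resync period %v outside [%v, %v)", p, min, 2*min)
		}
	}
}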

+ 40
- 0
pkg/globalmanager/controllers/registry.go View File

@@ -0,0 +1,40 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"github.com/kubeedge/sedna/pkg/globalmanager/controllers/dataset"
fl "github.com/kubeedge/sedna/pkg/globalmanager/controllers/federatedlearning"
il "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning"
ji "github.com/kubeedge/sedna/pkg/globalmanager/controllers/jointinference"
ll "github.com/kubeedge/sedna/pkg/globalmanager/controllers/lifelonglearning"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

type FeatureFactory = func(*runtime.ControllerContext) (runtime.FeatureControllerI, error)

type Registry map[string]FeatureFactory

func NewRegistry() Registry {
return Registry{
ji.Name: ji.New,
fl.Name: fl.New,
il.Name: il.New,
ll.Name: ll.New,
dataset.Name: dataset.New,
}
}
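
Adding a feature is one new registry entry plus a package exposing Name and a New factory. A minimal sketch of such a package (the mycontroller name and stub behavior are hypothetical; a real controller does the informer wiring shown in the lifelonglearning package above):

package mycontroller

import "github.com/kubeedge/sedna/pkg/globalmanager/runtime"

// Name is the registry key for this hypothetical feature.
const Name = "MyFeature"

// Controller is a do-nothing stand-in satisfying runtime.FeatureControllerI.
type Controller struct{}

func (c *Controller) Run(stopCh <-chan struct{}) {}

func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error { return nil }

func (c *Controller) SetUpstreamHandler(add runtime.UpstreamHandlerAddFunc) error { return nil }

// New would then be referenced from NewRegistry as mycontroller.New.
func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
	return &Controller{}, nil
}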

+ 105
- 0
pkg/globalmanager/controllers/upstream.go View File

@@ -0,0 +1,105 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"fmt"
"strings"

"k8s.io/klog/v2"

"github.com/kubeedge/sedna/pkg/globalmanager/messagelayer"
"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

// UpstreamController subscribes the updates from edge and syncs to k8s api server
type UpstreamController struct {
messageLayer messagelayer.MessageLayer
updateHandlers map[string]runtime.UpstreamHandler
}

func (uc *UpstreamController) checkOperation(operation string) error {
// current only support the 'status' operation
if operation != "status" {
return fmt.Errorf("unknown operation '%s'", operation)
}
return nil
}

// syncEdgeUpdate receives the updates from edge and syncs these to k8s.
func (uc *UpstreamController) syncEdgeUpdate() {
for {
select {
case <-uc.messageLayer.Done():
klog.Info("Stop sedna upstream loop")
return
default:
}

update, err := uc.messageLayer.ReceiveResourceUpdate()
if err == nil {
err = uc.checkOperation(update.Operation)
}
if err != nil {
klog.Warningf("Ignore update since this err: %+v", err)
continue
}

kind := update.Kind
namespace := update.Namespace
name := update.Name
operation := update.Operation

handler, ok := uc.updateHandlers[kind]
if ok {
err := handler(name, namespace, operation, update.Content)
if err != nil {
klog.Errorf("Error to handle %s %s/%s operation(%s): %+v", kind, namespace, name, operation, err)
}
} else {
klog.Warningf("No handler for resource kind %s", kind)
}
}
}

// Run starts the upstream controller
func (uc *UpstreamController) Run(stopCh <-chan struct{}) {
klog.Info("Start the sedna upstream controller")

uc.syncEdgeUpdate()
<-stopCh
}

func (uc *UpstreamController) Add(kind string, handler runtime.UpstreamHandler) error {
kind = strings.ToLower(kind)
if _, ok := uc.updateHandlers[kind]; ok {
return fmt.Errorf("a upstream handler for kind %s already exists", kind)
}
uc.updateHandlers[kind] = handler

return nil
}

// NewUpstreamController creates a new Upstream controller from config
func NewUpstreamController(cc *runtime.ControllerContext) (*UpstreamController, error) {
uc := &UpstreamController{
messageLayer: messagelayer.NewContextMessageLayer(),
updateHandlers: make(map[string]runtime.UpstreamHandler),
}

return uc, nil
}
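
Read together with the manager: Start hands uc.Add to every feature controller's SetUpstreamHandler, which registers one handler per kind. A hedged sketch of that registration (the kind string and handler body are illustrative only):

package controllers

import (
	"k8s.io/klog/v2"

	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

// registerExampleHandler sketches what a feature controller does with the
// AddFunc it receives; "examplejob" stands in for a real KindName.
func registerExampleHandler(addFunc runtime.UpstreamHandlerAddFunc) error {
	return addFunc("examplejob", func(namespace, name, operation string, content []byte) error {
		// decode content and update the CR status here, as the
		// lifelonglearning updateFromEdge does above
		klog.Infof("received %s update for %s/%s", operation, namespace, name)
		return nil
	})
}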

+ 0
- 388
pkg/globalmanager/downstream.go View File

@@ -1,388 +0,0 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package globalmanager

import (
"context"
"fmt"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/config"
"github.com/kubeedge/sedna/pkg/globalmanager/messagelayer"
"github.com/kubeedge/sedna/pkg/globalmanager/utils"
)

// DownstreamController watch kubernetes api server and send the controller resource change to edge
type DownstreamController struct {
// events from watch kubernetes api server
events chan watch.Event

cfg *config.ControllerConfig

client *clientset.SednaV1alpha1Client
kubeClient kubernetes.Interface

messageLayer messagelayer.MessageLayer
}

func (dc *DownstreamController) injectSecret(obj CommonInterface, secretName string) error {
if secretName == "" {
return nil
}

secret, err := dc.kubeClient.CoreV1().Secrets(obj.GetNamespace()).Get(context.TODO(), secretName, metav1.GetOptions{})
if err != nil {
klog.Warningf("failed to get the secret %s: %+v",
secretName, err)

return err
}
InjectSecretObj(obj, secret)
return err
}

// syncDataset syncs the dataset resources
func (dc *DownstreamController) syncDataset(eventType watch.EventType, dataset *sednav1.Dataset) error {
// Here only propagate to the nodes with non empty name
nodeName := dataset.Spec.NodeName
if len(nodeName) == 0 {
return fmt.Errorf("empty node name")
}
dc.injectSecret(dataset, dataset.Spec.CredentialName)

return dc.messageLayer.SendResourceObject(nodeName, eventType, dataset)
}

// syncJointInferenceService syncs the joint-inference-service resources
func (dc *DownstreamController) syncJointInferenceService(eventType watch.EventType, joint *sednav1.JointInferenceService) error {
// Here only propagate to the nodes with non empty name
// FIXME: only the case that Spec.NodeName specified is support
nodeName := joint.Spec.EdgeWorker.Template.Spec.NodeName
if len(nodeName) == 0 {
return fmt.Errorf("empty node name")
}

return dc.messageLayer.SendResourceObject(nodeName, eventType, joint)
}

// syncFederatedLearningJob syncs the federated resources
func (dc *DownstreamController) syncFederatedLearningJob(eventType watch.EventType, job *sednav1.FederatedLearningJob) error {
// broadcast to all nodes specified in spec
nodeset := make(map[string]bool)
for _, trainingWorker := range job.Spec.TrainingWorkers {
// Here only propagate to the nodes with non empty name
if len(trainingWorker.Template.Spec.NodeName) > 0 {
nodeset[trainingWorker.Template.Spec.NodeName] = true
}
}

for nodeName := range nodeset {
dc.messageLayer.SendResourceObject(nodeName, eventType, job)
}
return nil
}

// syncModelWithName will sync the model to the specified node.
// Now called when creating the incrementaljob.
func (dc *DownstreamController) syncModelWithName(nodeName, modelName, namespace string) error {
model, err := dc.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{})
if err != nil {
// TODO: maybe use err.ErrStatus.Code == 404
return fmt.Errorf("model(%s/%s) not found", namespace, modelName)
}

// Since model.Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
if len(model.Kind) == 0 {
model.Kind = "Model"
}

dc.injectSecret(model, model.Spec.CredentialName)

dc.messageLayer.SendResourceObject(nodeName, watch.Added, model)
return nil
}

// syncIncrementalJob syncs the incremental learning jobs
func (dc *DownstreamController) syncIncrementalJob(eventType watch.EventType, job *sednav1.IncrementalLearningJob) error {
jobConditions := job.Status.Conditions
if len(jobConditions) == 0 {
return nil
}

dataName := job.Spec.Dataset.Name
ds, err := dc.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("dataset(%s/%s) not found", job.Namespace, dataName)
}
// LC has dataset object on this node that may call dataset node
dsNodeName := ds.Spec.NodeName

var trainNodeName string
var evalNodeName string

ann := job.GetAnnotations()
if ann != nil {
trainNodeName = ann[AnnotationsKeyPrefix+string(sednav1.ILJobTrain)]
evalNodeName = ann[AnnotationsKeyPrefix+string(sednav1.ILJobEval)]
}

if eventType == watch.Deleted {
// delete jobs from all LCs
for _, v := range []string{dsNodeName, trainNodeName, evalNodeName} {
if v != "" {
dc.messageLayer.SendResourceObject(v, eventType, job)
}
}
return nil
}

latestCondition := jobConditions[len(jobConditions)-1]
currentType := latestCondition.Type
jobStage := latestCondition.Stage

syncModelWithName := func(modelName string) {
if err := dc.syncModelWithName(dsNodeName, modelName, job.Namespace); err != nil {
klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v",
modelName, job.Name, dsNodeName, err)
}
}

syncJobWithNodeName := func(nodeName string) {
if err := dc.messageLayer.SendResourceObject(nodeName, eventType, job); err != nil {
klog.Warningf("Error to sync incremental learning job %s to node %s in stage %s: %v",
job.Name, nodeName, jobStage, err)
}
}

dc.injectSecret(job, job.Spec.CredentialName)

doJobStageEvent := func(modelName string, nodeName string) {
if currentType == sednav1.ILJobStageCondWaiting {
syncJobWithNodeName(dsNodeName)
syncModelWithName(modelName)
} else if currentType == sednav1.ILJobStageCondRunning {
if nodeName != "" {
syncJobWithNodeName(nodeName)
}
} else if currentType == sednav1.ILJobStageCondCompleted || currentType == sednav1.ILJobStageCondFailed {
if nodeName != dsNodeName {
// delete LC's job from nodeName that's different from dataset node when worker's status is completed or failed.
dc.messageLayer.SendResourceObject(nodeName, watch.Deleted, job)
}
}
}

switch jobStage {
case sednav1.ILJobTrain:
doJobStageEvent(job.Spec.InitialModel.Name, trainNodeName)
case sednav1.ILJobEval:
doJobStageEvent(job.Spec.DeploySpec.Model.Name, evalNodeName)
}

return nil
}

// syncLifelongLearningJob syncs the lifelonglearning jobs
func (dc *DownstreamController) syncLifelongLearningJob(eventType watch.EventType, job *sednav1.LifelongLearningJob) error {
// Here only propagate to the nodes with non empty name

// FIXME(llhuii): only the case that all workers having the same nodeName are support,
// will support Spec.NodeSelector and differenect nodeName.
nodeName := job.Spec.TrainSpec.Template.Spec.NodeName
if len(nodeName) == 0 {
return fmt.Errorf("empty node name")
}

dc.injectSecret(job, job.Spec.CredentialName)
dc.messageLayer.SendResourceObject(nodeName, eventType, job)

return nil
}

// sync defines the entrypoint of syncing all resources
func (dc *DownstreamController) sync(stopCh <-chan struct{}) {
for {
select {
case <-stopCh:
klog.Info("Stop controller downstream loop")
return

case e := <-dc.events:

var err error
var kind, namespace, name string
switch t := e.Object.(type) {
case (*sednav1.Dataset):
// Since t.Kind may be empty,
// we need to fix the kind here if missing.
// more details at https://github.com/kubernetes/kubernetes/issues/3030
if len(t.Kind) == 0 {
t.Kind = "Dataset"
}
kind = t.Kind
namespace = t.Namespace
name = t.Name
err = dc.syncDataset(e.Type, t)

case (*sednav1.JointInferenceService):
// TODO: find a good way to avoid these duplicate codes
if len(t.Kind) == 0 {
t.Kind = "JointInferenceService"
}
kind = t.Kind
namespace = t.Namespace
name = t.Name
err = dc.syncJointInferenceService(e.Type, t)

case (*sednav1.FederatedLearningJob):
if len(t.Kind) == 0 {
t.Kind = "FederatedLearningJob"
}
kind = t.Kind
namespace = t.Namespace
name = t.Name
err = dc.syncFederatedLearningJob(e.Type, t)

case (*sednav1.IncrementalLearningJob):
if len(t.Kind) == 0 {
t.Kind = "IncrementalLearningJob"
}
kind = t.Kind
namespace = t.Namespace
name = t.Name
err = dc.syncIncrementalJob(e.Type, t)
case (*sednav1.LifelongLearningJob):
if len(t.Kind) == 0 {
t.Kind = "LifelongLearningJob"
}
kind = t.Kind
namespace = t.Namespace
name = t.Name
err = dc.syncLifelongLearningJob(e.Type, t)
default:
klog.Warningf("object type: %T unsupported", e)
continue
}

if err != nil {
klog.Warningf("Error to sync %s(%s/%s), err: %+v", kind, namespace, name, err)
} else {
klog.V(2).Infof("synced %s(%s/%s)", kind, namespace, name)
}
}
}
}

// watch function watches the crd resources which should by synced to nodes
func (dc *DownstreamController) watch(stopCh <-chan struct{}) {
rh := cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
eventObj := obj.(runtime.Object)
dc.events <- watch.Event{Type: watch.Added, Object: eventObj}
},
UpdateFunc: func(old, cur interface{}) {
// Since we don't support the spec update operation currently,
// so only status updates arrive here and NO propagation to edge.

// Update:
// We sync it to edge when using self-built websocket, and
// this sync isn't needed when we switch out self-built websocket.
dc.events <- watch.Event{Type: watch.Added, Object: cur.(runtime.Object)}
},
DeleteFunc: func(obj interface{}) {
eventObj := obj.(runtime.Object)
dc.events <- watch.Event{Type: watch.Deleted, Object: eventObj}
},
}

client := dc.client.RESTClient()

// make this option configurable
resyncPeriod := time.Second * 60
namespace := dc.cfg.Namespace

// TODO: use the informer
for resourceName, object := range map[string]runtime.Object{
"datasets": &sednav1.Dataset{},
"jointinferenceservices": &sednav1.JointInferenceService{},
"federatedlearningjobs": &sednav1.FederatedLearningJob{},
"incrementallearningjobs": &sednav1.IncrementalLearningJob{},
"lifelonglearningjobs": &sednav1.LifelongLearningJob{},
} {
lw := cache.NewListWatchFromClient(client, resourceName, namespace, fields.Everything())
si := cache.NewSharedInformer(lw, object, resyncPeriod)
si.AddEventHandler(rh)
go si.Run(stopCh)
}
}

// Start starts the controller
func (dc *DownstreamController) Start() error {
stopCh := dc.messageLayer.Done()

// watch is an asynchronous call
dc.watch(stopCh)

// sync is a synchronous call
go dc.sync(stopCh)

return nil
}

// GetName returns the name of the downstream controller
func (dc *DownstreamController) GetName() string {
return "DownstreamController"
}

// NewDownstreamController creates a controller DownstreamController from config
func NewDownstreamController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
// TODO: make bufferSize configurable
bufferSize := 10
events := make(chan watch.Event, bufferSize)

crdclient, err := utils.NewCRDClient()
if err != nil {
return nil, fmt.Errorf("create crd client failed with error: %w", err)
}

kubeClient, err := utils.KubeClient()
if err != nil {
return nil, err
}

dc := &DownstreamController{
cfg: cfg,
events: events,
client: crdclient,
kubeClient: kubeClient,
messageLayer: messagelayer.NewContextMessageLayer(),
}

return dc, nil
}

pkg/globalmanager/common.go → pkg/globalmanager/runtime/common.go View File

@@ -14,10 +14,11 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

-package globalmanager
+package runtime

import (
	"context"
+	"encoding/json"
	"fmt"
	"math"
	"strings"
@@ -27,16 +28,14 @@ import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/util/workqueue"
+	"k8s.io/klog/v2"
+
+	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
)

const (
-	// DefaultBackOff is the default backoff period
-	DefaultBackOff = 10 * time.Second
-	// MaxBackOff is the max backoff period
-	MaxBackOff = 360 * time.Second
-	bigModelPort int32 = 5000
-	// ResourceUpdateRetries defines times of retrying to update resource
-	ResourceUpdateRetries = 3
+	// resourceUpdateTries defines times of trying to update resource
+	resourceUpdateTries = 3
)

// GetNodeIPByName get node ip by node name
@@ -62,8 +61,8 @@ func GetNodeIPByName(kubeClient kubernetes.Interface, name string) (string, erro
	return "", fmt.Errorf("can't found node ip for node %s", name)
}

-// getBackoff calc the next wait time for the key
-func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration {
+// GetBackoff calc the next wait time for the key
+func GetBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Duration {
	exp := queue.NumRequeues(key)

	if exp <= 0 {
@@ -83,7 +82,7 @@ func getBackoff(queue workqueue.RateLimitingInterface, key interface{}) time.Dur
	return calculated
}

-func calcActivePodCount(pods []*v1.Pod) int32 {
+func CalcActivePodCount(pods []*v1.Pod) int32 {
	var result int32 = 0
	for _, p := range pods {
		if v1.PodSucceeded != p.Status.Phase &&
@@ -129,3 +128,35 @@ func ConvertK8SValidName(name string) string {

	return string(fixName)
}
+
+// ConvertMapToMetrics converts the metric map to list of resource Metric
+func ConvertMapToMetrics(metric map[string]interface{}) []sednav1.Metric {
+	var l []sednav1.Metric
+	for k, v := range metric {
+		var displayValue string
+		switch t := v.(type) {
+		case string:
+			displayValue = t
+		default:
+			// ignore the json marshal error
+			b, _ := json.Marshal(v)
+			displayValue = string(b)
+		}
+
+		l = append(l, sednav1.Metric{Key: k, Value: displayValue})
+	}
+	return l
+}
+
+// RetryUpdateStatus simply retries to call the status update func
+func RetryUpdateStatus(name, namespace string, updateStatusFunc func() error) error {
+	var err error
+	for try := 1; try <= resourceUpdateTries; try++ {
+		err = updateStatusFunc()
+		if err == nil {
+			return nil
+		}
+		klog.Warningf("Error to update %s/%s status, tried %d times: %+v", namespace, name, try, err)
+	}
+	return err
+}

pkg/globalmanager/secret_injector.go → pkg/globalmanager/runtime/secret_injector.go View File

@@ -14,13 +14,16 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

-package globalmanager
+package runtime

import (
+	"context"
	"encoding/json"
	"fmt"

	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
)

const (
@@ -106,11 +109,18 @@ func MergeSecretEnvs(nowE, newE []v1.EnvVar, overwrite bool) []v1.EnvVar {
	return nowE
}

-func InjectSecretObj(obj CommonInterface, secret *v1.Secret) {
-	if secret == nil {
+func InjectSecretAnnotations(client kubernetes.Interface, obj CommonInterface, secretName string) (err error) {
+	if len(secretName) == 0 {
+		return
+	}
+	secret, err := client.CoreV1().Secrets(obj.GetNamespace()).Get(context.TODO(), secretName, metav1.GetOptions{})
+	if err != nil {
		return
	}
+	return injectSecretObj(obj, secret)
+}

+func injectSecretObj(obj CommonInterface, secret *v1.Secret) (err error) {
	secretData := secret.GetAnnotations()

	for k, v := range secret.Data {
@@ -127,4 +137,5 @@ func InjectSecretObj(obj CommonInterface, secret *v1.Secret) {
	ann[SecretAnnotationKey] = string(b)

	obj.SetAnnotations(ann)
+	return nil
}

pkg/globalmanager/storage_initializer_injector.go → pkg/globalmanager/runtime/storage_initializer_injector.go View File

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

-package globalmanager
+package runtime

import (
	"net/url"
@@ -179,7 +179,7 @@ func injectHostPathMount(pod *v1.Pod, workerParam *WorkerParam) {

	hostPathType := v1.HostPathDirectory

-	for _, mount := range workerParam.mounts {
+	for _, mount := range workerParam.Mounts {
		for _, m := range mount.URLs {
			if m.HostPath == "" {
				continue
@@ -240,7 +240,7 @@ func injectHostPathMount(pod *v1.Pod, workerParam *WorkerParam) {

func injectWorkerSecrets(pod *v1.Pod, workerParam *WorkerParam) {
	var secretEnvs []v1.EnvVar
-	for _, mount := range workerParam.mounts {
+	for _, mount := range workerParam.Mounts {
		for _, m := range mount.URLs {
			if m.Disable || m.DownloadByInitializer {
				continue
@@ -259,7 +259,7 @@ func injectInitializerContainer(pod *v1.Pod, workerParam *WorkerParam) {

	var downloadPairs []string
	var secretEnvs []v1.EnvVar
-	for _, mount := range workerParam.mounts {
+	for _, mount := range workerParam.Mounts {
		for _, m := range mount.URLs {
			if m.Disable {
				continue
@@ -345,7 +345,7 @@ func injectInitializerContainer(pod *v1.Pod, workerParam *WorkerParam) {
func InjectStorageInitializer(pod *v1.Pod, workerParam *WorkerParam) {
	var mounts []WorkerMount
	// parse the mounts and environment key
-	for _, mount := range workerParam.mounts {
+	for _, mount := range workerParam.Mounts {
		var envPaths []string

		if mount.URL != nil {
@@ -374,13 +374,13 @@ func InjectStorageInitializer(pod *v1.Pod, workerParam *WorkerParam) {
		}

		if mount.EnvName != "" {
-			workerParam.env[mount.EnvName] = strings.Join(
+			workerParam.Env[mount.EnvName] = strings.Join(
				envPaths, urlsFieldSep,
			)
		}
	}

-	workerParam.mounts = mounts
+	workerParam.Mounts = mounts

	// need to call injectInitializerContainer before injectHostPathMount
	// since injectHostPathMount could inject volumeMount to init container

+ 103
- 0
pkg/globalmanager/runtime/types.go View File

@@ -0,0 +1,103 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
"time"

"github.com/kubeedge/sedna/pkg/globalmanager/config"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
k8sruntime "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/watch"
kubeinformers "k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"

sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned"
sednainformers "github.com/kubeedge/sedna/pkg/client/informers/externalversions"
)

const (
// DefaultBackOff is the default backoff period
DefaultBackOff = 10 * time.Second
// MaxBackOff is the max backoff period
MaxBackOff = 360 * time.Second

// TrainPodType is type of train pod
TrainPodType = "train"
// EvalPodType is type of eval pod
EvalPodType = "eval"
// InferencePodType is type of inference pod
InferencePodType = "inference"

// AnnotationsKeyPrefix defines prefix of key in annotations
AnnotationsKeyPrefix = "sedna.io/"
)

type Model struct {
Format string `json:"format,omitempty"`
URL string `json:"url,omitempty"`
Metrics map[string]interface{} `json:"metrics,omitempty"`
}

func (m *Model) GetURL() string {
return m.URL
}

// CommonInterface describes the commom interface of CRs
type CommonInterface interface {
metav1.Object
schema.ObjectKind
k8sruntime.Object
}

// UpstreamHandler is the function definition for handling the upstream updates,
// i.e. resource updates(mainly status) from LC(running at edge)
type UpstreamHandler = func(namespace, name, operation string, content []byte) error

// UpstreamHandlerAddFunc defines the upstream controller register function for adding handler
type UpstreamHandlerAddFunc = func(kind string, updateHandler UpstreamHandler) error

// DownstreamSendFunc is the send function for feature controllers to sync the resource updates(spec and status) to LC
type DownstreamSendFunc = func(nodeName string, eventType watch.EventType, obj interface{}) error

// BaseControllerI defines the interface of an controller
type BaseControllerI interface {
Run(stopCh <-chan struct{})
}

// FeatureControllerI defines the interface of an AI Feature controller
type FeatureControllerI interface {
BaseControllerI

// SetDownstreamSendFunc sets up the downstream send function in the feature controller
SetDownstreamSendFunc(f DownstreamSendFunc) error

// SetUpstreamHandler sets up the upstream handler function for the feature controller
SetUpstreamHandler(add UpstreamHandlerAddFunc) error
}

// ControllerContext defines the context that all feature controller share and belong to
type ControllerContext struct {
Config *config.ControllerConfig

KubeClient kubernetes.Interface
KubeInformerFactory kubeinformers.SharedInformerFactory

SednaClient sednaclientset.Interface
SednaInformerFactory sednainformers.SharedInformerFactory
}
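
A small idiom that follows from these interfaces: a package can assert at compile time that a controller satisfies FeatureControllerI. A sketch (assuming, as registration requires, that the lifelonglearning Controller implements the remaining methods elsewhere in its package):

package controllers

import (
	ll "github.com/kubeedge/sedna/pkg/globalmanager/controllers/lifelonglearning"
	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
)

// Compile-time check; the build fails if the method set drifts.
var _ runtime.FeatureControllerI = (*ll.Controller)(nil)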

pkg/globalmanager/worker.go → pkg/globalmanager/runtime/worker.go View File

@@ -1,4 +1,4 @@
-package globalmanager
+package runtime

import (
	"context"
@@ -27,15 +27,15 @@ type WorkerMount struct {

// WorkerParam describes the system-defined parameters of worker
type WorkerParam struct {
-	mounts []WorkerMount
+	Mounts []WorkerMount

-	env        map[string]string
-	workerType string
+	Env        map[string]string
+	WorkerType string

	// if true, force to use hostNetwork
-	hostNetwork bool
+	HostNetwork bool

-	restartPolicy v1.RestartPolicy
+	RestartPolicy v1.RestartPolicy
}

// generateLabels generates labels for an object
@@ -109,7 +109,7 @@ func CreateKubernetesService(kubeClient kubernetes.Interface, object CommonInter
func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInterface) {
	InjectStorageInitializer(pod, workerParam)

-	envs := createEnvVars(workerParam.env)
+	envs := createEnvVars(workerParam.Env)
	for idx := range pod.Spec.Containers {
		pod.Spec.Containers[idx].Env = append(
			pod.Spec.Containers[idx].Env, envs...,
@@ -121,27 +121,27 @@ func injectWorkerParam(pod *v1.Pod, workerParam *WorkerParam, object CommonInter
		pod.Labels = make(map[string]string)
	}

-	for k, v := range generateLabels(object, workerParam.workerType) {
+	for k, v := range generateLabels(object, workerParam.WorkerType) {
		pod.Labels[k] = v
	}

-	pod.GenerateName = object.GetName() + "-" + strings.ToLower(workerParam.workerType) + "-"
+	pod.GenerateName = object.GetName() + "-" + strings.ToLower(workerParam.WorkerType) + "-"

	pod.Namespace = object.GetNamespace()

-	if workerParam.hostNetwork {
+	if workerParam.HostNetwork {
		// FIXME
		// force to set hostnetwork
		pod.Spec.HostNetwork = true
	}

	if pod.Spec.RestartPolicy == "" {
-		pod.Spec.RestartPolicy = workerParam.restartPolicy
+		pod.Spec.RestartPolicy = workerParam.RestartPolicy
	}
}

-// createPodWithTemplate creates and returns a pod object given a crd object, pod template, and workerParam
-func createPodWithTemplate(client kubernetes.Interface, object CommonInterface, spec *v1.PodTemplateSpec, workerParam *WorkerParam) (*v1.Pod, error) {
+// CreatePodWithTemplate creates and returns a pod object given a crd object, pod template, and workerParam
+func CreatePodWithTemplate(client kubernetes.Interface, object CommonInterface, spec *v1.PodTemplateSpec, workerParam *WorkerParam) (*v1.Pod, error) {
	objectKind := object.GroupVersionKind()
	pod, _ := k8scontroller.GetPodFromTemplate(spec, object, metav1.NewControllerRef(object, objectKind))
	injectWorkerParam(pod, workerParam, object)
@@ -149,7 +149,7 @@ func CreatePodWithTemplate(client kubernetes.Interface, object CommonInterface,
	createdPod, err := client.CoreV1().Pods(object.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
	objectName := object.GetNamespace() + "/" + object.GetName()
	if err != nil {
-		klog.Warningf("failed to create pod(type=%s) for %s %s, err:%s", workerParam.workerType, objectKind, objectName, err)
+		klog.Warningf("failed to create pod(type=%s) for %s %s, err:%s", workerParam.WorkerType, objectKind, objectName, err)
		return nil, err
	}
	klog.V(2).Infof("pod %s is created successfully for %s %s", createdPod.Name, objectKind, objectName)
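
With the fields exported, feature controllers outside the runtime package now fill WorkerParam directly and call CreatePodWithTemplate. A condensed sketch mirroring createInferPod above (kubeClient and job are assumed to come from the controller's context):

workerParam := new(runtime.WorkerParam)
workerParam.WorkerType = runtime.InferencePodType
workerParam.HostNetwork = true
workerParam.Env = map[string]string{
	"NAMESPACE": job.Namespace,
	"JOB_NAME":  job.Name,
}
// the pod template comes from the CR's deploy spec, as in the code above
_, err := runtime.CreatePodWithTemplate(kubeClient, job, &job.Spec.DeploySpec.Template, workerParam)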

+ 0
- 168
pkg/globalmanager/types.go View File

@@ -1,168 +0,0 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package globalmanager

import (
"encoding/json"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
)

// CommonInterface describes the commom interface of CRs
type CommonInterface interface {
metav1.Object
schema.ObjectKind
runtime.Object
}

// FeatureControllerI defines the interface of an AI Feature controller
type FeatureControllerI interface {
Start() error
GetName() string
}

type Model struct {
Format string `json:"format,omitempty"`
URL string `json:"url,omitempty"`
Metrics map[string]interface{} `json:"metrics,omitempty"`
}

// the data of this condition including the input/output to do the next step
type IncrementalCondData struct {
Input *struct {
// Only one model cases
Model *Model `json:"model,omitempty"`
Models []Model `json:"models,omitempty"`

DataURL string `json:"dataURL,omitempty"`

// the data samples reference will be stored into this URL.
// The content of this url would be:
// # the first uncomment line means the directory
// s3://dataset/
// mnist/0.jpg
// mnist/1.jpg
DataIndexURL string `json:"dataIndexURL,omitempty"`

OutputDir string `json:"outputDir,omitempty"`
} `json:"input,omitempty"`

Output *struct {
Model *Model `json:"model,omitempty"`
Models []Model `json:"models,omitempty"`
} `json:"output,omitempty"`
}

const (
// TrainPodType is type of train pod
TrainPodType = "train"
// EvalPodType is type of eval pod
EvalPodType = "eval"
// InferencePodType is type of inference pod
InferencePodType = "inference"

// AnnotationsKeyPrefix defines prefix of key in annotations
AnnotationsKeyPrefix = "sedna.io/"
)

func (m *Model) GetURL() string {
return m.URL
}

func (cd *IncrementalCondData) joinModelURLs(model *Model, models []Model) []string {
var modelURLs []string
if model != nil {
modelURLs = append(modelURLs, model.GetURL())
} else {
for _, m := range models {
modelURLs = append(modelURLs, m.GetURL())
}
}
return modelURLs
}

func (cd *IncrementalCondData) GetInputModelURLs() []string {
return cd.joinModelURLs(cd.Input.Model, cd.Input.Models)
}

func (cd *IncrementalCondData) GetOutputModelURLs() []string {
return cd.joinModelURLs(cd.Output.Model, cd.Output.Models)
}

func (cd *IncrementalCondData) Unmarshal(data []byte) error {
return json.Unmarshal(data, cd)
}

func (cd IncrementalCondData) Marshal() ([]byte, error) {
return json.Marshal(cd)
}

// the data of this condition including the input/output to do the next step
type LifelongLearningCondData struct {
Input *struct {
// Only one model cases
Model *Model `json:"model,omitempty"`
Models []Model `json:"models,omitempty"`

DataURL string `json:"dataURL,omitempty"`

// the data samples reference will be stored into this URL.
// The content of this url would be:
// # the first uncomment line means the directory
// s3://dataset/
// mnist/0.jpg
// mnist/1.jpg
DataIndexURL string `json:"dataIndexURL,omitempty"`

OutputDir string `json:"outputDir,omitempty"`
} `json:"input,omitempty"`

Output *struct {
Model *Model `json:"model,omitempty"`
Models []Model `json:"models,omitempty"`
} `json:"output,omitempty"`
}

func (cd *LifelongLearningCondData) joinModelURLs(model *Model, models []Model) []string {
var modelURLs []string
if model != nil {
modelURLs = append(modelURLs, model.GetURL())
} else {
for _, m := range models {
modelURLs = append(modelURLs, m.GetURL())
}
}
return modelURLs
}

func (cd *LifelongLearningCondData) Unmarshal(data []byte) error {
return json.Unmarshal(data, cd)
}

func (cd LifelongLearningCondData) Marshal() ([]byte, error) {
return json.Marshal(cd)
}

func (cd *LifelongLearningCondData) GetInputModelURLs() []string {
return cd.joinModelURLs(cd.Input.Model, cd.Input.Models)
}

func (cd *LifelongLearningCondData) GetOutputModelURLs() []string {
return cd.joinModelURLs(cd.Output.Model, cd.Output.Models)
}

+ 0
- 519
pkg/globalmanager/upstream.go View File

@@ -1,519 +0,0 @@
/*
Copyright 2021 The KubeEdge Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package globalmanager

import (
"context"
"encoding/json"
"fmt"
"strings"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"

sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
clientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
"github.com/kubeedge/sedna/pkg/globalmanager/config"
"github.com/kubeedge/sedna/pkg/globalmanager/messagelayer"
"github.com/kubeedge/sedna/pkg/globalmanager/utils"
)

// updateHandler handles the updates from the LC (running at the edge) to update the
// corresponding resource
type updateHandler func(namespace, name, operation string, content []byte) error

// UpstreamController subscribes to the updates from the edge and syncs them to the k8s API server
type UpstreamController struct {
client *clientset.SednaV1alpha1Client
messageLayer messagelayer.MessageLayer
updateHandlers map[string]updateHandler
}

const upstreamStatusUpdateRetries = 3

// retryUpdateStatus retries calling the status update func
func retryUpdateStatus(name, namespace string, updateStatusFunc func() error) error {
var err error
for retry := 0; retry <= upstreamStatusUpdateRetries; retry++ {
err = updateStatusFunc()
if err == nil {
return nil
}
klog.Warningf("Error to update %s/%s status, retried %d times: %+v", namespace, name, retry, err)
}
return err
}

func newUnmarshalError(namespace, name, operation string, content []byte) error {
return fmt.Errorf("Unable to unmarshal content for (%s/%s) operation: '%s', content: '%+v'", namespace, name, operation, string(content))
}

func checkUpstreamOperation(operation string) error {
// currently only the 'status' operation is supported
if operation != "status" {
return fmt.Errorf("unknown operation %s", operation)
}
return nil
}

// updateDatasetStatus updates the dataset status
func (uc *UpstreamController) updateDatasetStatus(name, namespace string, status sednav1.DatasetStatus) error {
client := uc.client.Datasets(namespace)

if status.UpdateTime == nil {
now := metav1.Now()
status.UpdateTime = &now
}

return retryUpdateStatus(name, namespace, func() error {
dataset, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
dataset.Status = status
_, err = client.UpdateStatus(context.TODO(), dataset, metav1.UpdateOptions{})
return err
})
}

// updateDatasetFromEdge syncs the dataset update from the edge
func (uc *UpstreamController) updateDatasetFromEdge(name, namespace, operation string, content []byte) error {
err := checkUpstreamOperation(operation)
if err != nil {
return err
}

status := sednav1.DatasetStatus{}
err = json.Unmarshal(content, &status)
if err != nil {
return newUnmarshalError(namespace, name, operation, content)
}

return uc.updateDatasetStatus(name, namespace, status)
}

// convertToMetrics converts the metrics from LCs to resource metrics
func convertToMetrics(m map[string]interface{}) []sednav1.Metric {
var l []sednav1.Metric
for k, v := range m {
var displayValue string
switch t := v.(type) {
case string:
displayValue = t
default:
// ignore the json marshal error
b, _ := json.Marshal(v)
displayValue = string(b)
}

l = append(l, sednav1.Metric{Key: k, Value: displayValue})
}
return l
}
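
// For example (illustrative values; Go map iteration order is not
// deterministic, so the order of the resulting slice may vary):
//
//	convertToMetrics(map[string]interface{}{"precision": 0.95, "backend": "tensorflow"})
//	// => []sednav1.Metric{{Key: "precision", Value: "0.95"}, {Key: "backend", Value: "tensorflow"}}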

func (uc *UpstreamController) updateJointInferenceMetrics(name, namespace string, metrics []sednav1.Metric) error {
client := uc.client.JointInferenceServices(namespace)

return retryUpdateStatus(name, namespace, func() error {
joint, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
joint.Status.Metrics = metrics
_, err = client.UpdateStatus(context.TODO(), joint, metav1.UpdateOptions{})
return err
})
}

// updateJointInferenceFromEdge syncs the edge updates to k8s
func (uc *UpstreamController) updateJointInferenceFromEdge(name, namespace, operation string, content []byte) error {
err := checkUpstreamOperation(operation)
if err != nil {
return err
}

// Output defines owner output information
type Output struct {
ServiceInfo map[string]interface{} `json:"ownerInfo"`
}

var status struct {
// Phase should always be "inference"
Phase string `json:"phase"`
Status string `json:"status"`
Output *Output `json:"output"`
}

err = json.Unmarshal(content, &status)
if err != nil {
return newUnmarshalError(namespace, name, operation, content)
}

// TODO: propagate status.Status to k8s

output := status.Output
if output == nil || output.ServiceInfo == nil {
// no output info
klog.Warningf("empty status info for joint inference service %s/%s", namespace, name)
return nil
}

info := output.ServiceInfo

for _, ignoreTimeKey := range []string{
"startTime",
"updateTime",
} {
delete(info, ignoreTimeKey)
}

metrics := convertToMetrics(info)

err = uc.updateJointInferenceMetrics(name, namespace, metrics)
if err != nil {
return fmt.Errorf("failed to update metrics, err:%+w", err)
}
return nil
}
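
// A sample LC payload for this handler might look like the following
// (illustrative values; "inferenceNumber" is a hypothetical metric key):
//
//	{
//	  "phase": "inference",
//	  "status": "running",
//	  "output": {
//	    "ownerInfo": {
//	      "startTime": "2021-04-28T08:00:00Z",
//	      "inferenceNumber": 1000
//	    }
//	  }
//	}
//
// The time keys are dropped and the remaining ownerInfo entries are converted
// to the service's metrics.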

func (uc *UpstreamController) updateModelMetrics(name, namespace string, metrics []sednav1.Metric) error {
client := uc.client.Models(namespace)

return retryUpdateStatus(name, namespace, func() error {
model, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}

now := metav1.Now()
model.Status.UpdateTime = &now
model.Status.Metrics = metrics
_, err = client.UpdateStatus(context.TODO(), model, metav1.UpdateOptions{})
return err
})
}

func (uc *UpstreamController) updateModelMetricsByFederatedName(name, namespace string, metrics []sednav1.Metric) error {
client := uc.client.FederatedLearningJobs(namespace)
federatedLearningJob, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
// federated crd not found
return err
}
modelName := federatedLearningJob.Spec.AggregationWorker.Model.Name
return uc.updateModelMetrics(modelName, namespace, metrics)
}

func (uc *UpstreamController) appendFederatedLearningJobStatusCondition(name, namespace string, cond sednav1.FLJobCondition) error {
client := uc.client.FederatedLearningJobs(namespace)

return retryUpdateStatus(name, namespace, func() error {
job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
job.Status.Conditions = append(job.Status.Conditions, cond)
_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
return err
})
}

// updateFederatedLearningJobFromEdge updates the federated job's status
func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace, operation string, content []byte) (err error) {
err = checkUpstreamOperation(operation)
if err != nil {
return err
}

// JobInfo defines the job information
type JobInfo struct {
// Current training round
CurrentRound int `json:"currentRound"`
UpdateTime string `json:"updateTime"`
}

// Output defines job output information
type Output struct {
Models []Model `json:"models"`
JobInfo *JobInfo `json:"ownerInfo"`
}

var status struct {
Phase string `json:"phase"`
Status string `json:"status"`
Output *Output `json:"output"`
}

err = json.Unmarshal(content, &status)
if err != nil {
err = newUnmarshalError(namespace, name, operation, content)
return
}

output := status.Output

if output != nil {
// Update the model's metrics
if len(output.Models) > 0 {
// only one model
model := output.Models[0]
metrics := convertToMetrics(model.Metrics)
if len(metrics) > 0 {
uc.updateModelMetricsByFederatedName(name, namespace, metrics)
}
}

jobInfo := output.JobInfo
// update the job info if there is any
if jobInfo != nil && jobInfo.CurrentRound > 0 {
// Find a good place to save the progress info
// TODO: more meaningful reason/message
reason := "DoTraining"
message := fmt.Sprintf("Round %v reached at %s", jobInfo.CurrentRound, jobInfo.UpdateTime)
cond := NewFLJobCondition(sednav1.FLJobCondTraining, reason, message)
uc.appendFederatedLearningJobStatusCondition(name, namespace, cond)
}
}

return nil
}
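
// A sample LC payload for this handler (illustrative values; the "url" and
// "metrics" keys assume Model's JSON field tags):
//
//	{
//	  "phase": "training",
//	  "status": "running",
//	  "output": {
//	    "models": [{"url": "s3://models/global.pb", "metrics": {"loss": 0.12}}],
//	    "ownerInfo": {"currentRound": 3, "updateTime": "2021-04-28T08:00:00Z"}
//	  }
//	}
//
// would update the aggregation model's metrics and append a training condition
// with the message "Round 3 reached at 2021-04-28T08:00:00Z".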

func (uc *UpstreamController) appendIncrementalLearningJobStatusCondition(name, namespace string, cond sednav1.ILJobCondition) error {
client := uc.client.IncrementalLearningJobs(namespace)
return retryUpdateStatus(name, namespace, func() error {
job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
job.Status.Conditions = append(job.Status.Conditions, cond)
_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
return err
})
}

// updateIncrementalLearningFromEdge syncs the edge updates to k8s
func (uc *UpstreamController) updateIncrementalLearningFromEdge(name, namespace, operation string, content []byte) error {
err := checkUpstreamOperation(operation)
if err != nil {
return err
}
var jobStatus struct {
Phase string `json:"phase"`
Status string `json:"status"`
}

err = json.Unmarshal(content, &jobStatus)
if err != nil {
return newUnmarshalError(namespace, name, operation, content)
}

// Get the condition data.
// Here we unmarshal and re-marshal immediately to strip the unnecessary fields
var condData IncrementalCondData
err = json.Unmarshal(content, &condData)
if err != nil {
return newUnmarshalError(namespace, name, operation, content)
}
condDataBytes, _ := json.Marshal(&condData)

cond := sednav1.ILJobCondition{
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Now(),
LastTransitionTime: metav1.Now(),
Data: string(condDataBytes),
Message: "reported by lc",
}

switch strings.ToLower(jobStatus.Phase) {
case "train":
cond.Stage = sednav1.ILJobTrain
case "eval":
cond.Stage = sednav1.ILJobEval
case "deploy":
cond.Stage = sednav1.ILJobDeploy
default:
return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
}

switch strings.ToLower(jobStatus.Status) {
case "ready":
cond.Type = sednav1.ILJobStageCondReady
case "completed":
cond.Type = sednav1.ILJobStageCondCompleted
case "failed":
cond.Type = sednav1.ILJobStageCondFailed
case "waiting":
cond.Type = sednav1.ILJobStageCondWaiting
default:
return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
}

err = uc.appendIncrementalLearningJobStatusCondition(name, namespace, cond)
if err != nil {
return fmt.Errorf("failed to append condition, err:%+w", err)
}
return nil
}
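
// A sample LC payload (illustrative) and its mapping for this handler:
//
//	{"phase": "train", "status": "completed", "output": {"models": [...]}}
//
// yields a condition with Stage=ILJobTrain and Type=ILJobStageCondCompleted;
// the input/output fields are round-tripped through IncrementalCondData and
// stored in cond.Data.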

func (uc *UpstreamController) appendLifelongLearningJobStatusCondition(name, namespace string, cond sednav1.LLJobCondition) error {
client := uc.client.LifelongLearningJobs(namespace)
return retryUpdateStatus(name, namespace, func() error {
job, err := client.Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return err
}
job.Status.Conditions = append(job.Status.Conditions, cond)
_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{})
return err
})
}

// updateLifelongLearningJobFromEdge syncs the edge updates to k8s
func (uc *UpstreamController) updateLifelongLearningJobFromEdge(name, namespace, operation string, content []byte) error {
err := checkUpstreamOperation(operation)
if err != nil {
return err
}
var jobStatus struct {
Phase string `json:"phase"`
Status string `json:"status"`
}

err = json.Unmarshal(content, &jobStatus)
if err != nil {
return newUnmarshalError(namespace, name, operation, content)
}

// Get the condition data.
// Here we unmarshal and re-marshal immediately to strip the unnecessary fields
var condData LifelongLearningCondData
err = json.Unmarshal(content, &condData)
if err != nil {
return newUnmarshalError(namespace, name, operation, content)
}
condDataBytes, _ := json.Marshal(&condData)

cond := sednav1.LLJobCondition{
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Now(),
LastTransitionTime: metav1.Now(),
Data: string(condDataBytes),
Message: "reported by lc",
}

switch strings.ToLower(jobStatus.Phase) {
case "train":
cond.Stage = sednav1.LLJobTrain
case "eval":
cond.Stage = sednav1.LLJobEval
case "deploy":
cond.Stage = sednav1.LLJobDeploy
default:
return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase)
}

switch strings.ToLower(jobStatus.Status) {
case "ready":
cond.Type = sednav1.LLJobStageCondReady
case "completed":
cond.Type = sednav1.LLJobStageCondCompleted
case "failed":
cond.Type = sednav1.LLJobStageCondFailed
case "waiting":
cond.Type = sednav1.LLJobStageCondWaiting
default:
return fmt.Errorf("invalid condition type: %v", jobStatus.Status)
}

err = uc.appendLifelongLearningJobStatusCondition(name, namespace, cond)
if err != nil {
return fmt.Errorf("failed to append condition, err:%+w", err)
}
return nil
}

// syncEdgeUpdate receives the updates from edge and syncs these to k8s.
func (uc *UpstreamController) syncEdgeUpdate() {
for {
select {
case <-uc.messageLayer.Done():
klog.Info("Stop sedna upstream loop")
return
default:
}

update, err := uc.messageLayer.ReceiveResourceUpdate()
if err != nil {
klog.Warningf("Ignore update since this err: %+v", err)
continue
}

kind := update.Kind
namespace := update.Namespace
name := update.Name
operation := update.Operation

handler, ok := uc.updateHandlers[kind]
if ok {
err := handler(name, namespace, operation, update.Content)
if err != nil {
klog.Errorf("Error to handle %s %s/%s operation(%s): %+v", kind, namespace, name, operation, err)
}
} else {
klog.Warningf("No handler for resource kind %s", kind)
}
}
}

// Start starts the upstream controller
func (uc *UpstreamController) Start() error {
klog.Info("Starting the sedna upstream controller")

go uc.syncEdgeUpdate()
return nil
}

// GetName returns the name of the upstream controller
func (uc *UpstreamController) GetName() string {
return "UpstreamController"
}

// NewUpstreamController creates a new upstream controller from the config
func NewUpstreamController(cfg *config.ControllerConfig) (FeatureControllerI, error) {
client, err := utils.NewCRDClient()
if err != nil {
return nil, fmt.Errorf("create crd client failed with error: %w", err)
}
uc := &UpstreamController{
client: client,
messageLayer: messagelayer.NewContextMessageLayer(),
}

// NOTE: currently there is no direct model update from the edge;
// model updates are triggered by the corresponding training features
uc.updateHandlers = map[string]updateHandler{
"dataset": uc.updateDatasetFromEdge,
"jointinferenceservice": uc.updateJointInferenceFromEdge,
"federatedlearningjob": uc.updateFederatedLearningJobFromEdge,
"incrementallearningjob": uc.updateIncrementalLearningFromEdge,
"lifelonglearningjob": uc.updateLifelongLearningJobFromEdge,
}

return uc, nil
}
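
// A minimal wiring sketch (error handling elided; cfg is the GM controller
// config as used above):
//
//	uc, _ := NewUpstreamController(cfg)
//	_ = uc.Start() // runs syncEdgeUpdate in a goroutine
//
// Each update received from the LC is then dispatched by its resource kind to
// one of the five handlers registered above.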

+ 5
- 4
pkg/localcontroller/manager/incrementallearningjob.go View File

@@ -31,7 +31,8 @@ import (
 
 	"github.com/kubeedge/sedna/cmd/sedna-lc/app/options"
 	sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
-	"github.com/kubeedge/sedna/pkg/globalmanager"
+	gmtypes "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning"
+	"github.com/kubeedge/sedna/pkg/globalmanager/runtime"
 	"github.com/kubeedge/sedna/pkg/localcontroller/db"
 	"github.com/kubeedge/sedna/pkg/localcontroller/gmclient"
 	"github.com/kubeedge/sedna/pkg/localcontroller/storage"
@@ -435,11 +436,11 @@ func newTrigger(t sednav1.Trigger) (trigger.Base, error) {
 func (im *IncrementalJobManager) getTrainOrEvalModel(job *IncrementalLearningJob, jobStage sednav1.ILJobStage) *ModelInfo {
 	jobConditions := job.Status.Conditions
 
-	// TODO: globalmanager.type changes to common.type for gm and lc
-	var models []globalmanager.Model
+	// TODO: runtime.type changes to common.type for gm and lc
+	var models []runtime.Model
 
 	for i := len(jobConditions) - 1; i >= 0; i-- {
-		var cond globalmanager.IncrementalCondData
+		var cond gmtypes.IncrementalCondData
 		jobCond := jobConditions[i]
 		if jobCond.Stage == sednav1.ILJobTrain && jobCond.Type == sednav1.ILJobStageCondCompleted {
 			if err := (&cond).Unmarshal([]byte(jobCond.Data)); err != nil {

