|
|
@@ -18,6 +18,7 @@ package lifelonglearning |
|
|
|
|
|
|
|
|
import ( |
|
|
import ( |
|
|
"context" |
|
|
"context" |
|
|
|
|
|
"encoding/json" |
|
|
"fmt" |
|
|
"fmt" |
|
|
"strings" |
|
|
"strings" |
|
|
"time" |
|
|
"time" |
|
|
@@ -25,6 +26,7 @@ import ( |
|
|
v1 "k8s.io/api/core/v1" |
|
|
v1 "k8s.io/api/core/v1" |
|
|
"k8s.io/apimachinery/pkg/api/errors" |
|
|
"k8s.io/apimachinery/pkg/api/errors" |
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
|
|
|
|
|
"k8s.io/apimachinery/pkg/types" |
|
|
utilrand "k8s.io/apimachinery/pkg/util/rand" |
|
|
utilrand "k8s.io/apimachinery/pkg/util/rand" |
|
|
utilruntime "k8s.io/apimachinery/pkg/util/runtime" |
|
|
utilruntime "k8s.io/apimachinery/pkg/util/runtime" |
|
|
"k8s.io/apimachinery/pkg/util/wait" |
|
|
"k8s.io/apimachinery/pkg/util/wait" |
|
|
@@ -294,10 +296,51 @@ func (c *Controller) sync(key string) (bool, error) { |
|
|
return forget, err |
|
|
return forget, err |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// setWorkerNodeNameOfJob sets the worker nodeName of the specified job |
|
|
|
|
|
// which is used for downstream to sync job info to the specified LC located in nodeName. |
|
|
|
|
|
func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.LifelongLearningJob, jobStage string, nodeName string) error { |
|
|
|
|
|
key := runtime.AnnotationsKeyPrefix + jobStage |
|
|
|
|
|
|
|
|
|
|
|
return c.addJobAnnotations(job, key, nodeName) |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// addJobAnnotations adds info in job annotations |
|
|
|
|
|
func (c *Controller) addJobAnnotations(job *sednav1.LifelongLearningJob, key string, value string) error { |
|
|
|
|
|
ann := job.GetAnnotations() |
|
|
|
|
|
if ann[key] == value { |
|
|
|
|
|
// already set |
|
|
|
|
|
return nil |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
patchData := metav1.PartialObjectMetadata{ |
|
|
|
|
|
ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{key: value}}} |
|
|
|
|
|
|
|
|
|
|
|
patchDataBytes, err := json.Marshal(&patchData) |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return err |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
jobClient := c.client.LifelongLearningJobs(job.Namespace) |
|
|
|
|
|
return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error { |
|
|
|
|
|
newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{}) |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return err |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
annotations := newJob.GetAnnotations() |
|
|
|
|
|
if annotations[key] == value { |
|
|
|
|
|
return nil |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
_, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, patchDataBytes, metav1.PatchOptions{}) |
|
|
|
|
|
return err |
|
|
|
|
|
}) |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
// transitJobState transit job to next state |
|
|
// transitJobState transit job to next state |
|
|
func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, error) { |
|
|
func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, error) { |
|
|
var initialType sednav1.LLJobStageConditionType |
|
|
var initialType sednav1.LLJobStageConditionType |
|
|
var latestCondition = sednav1.LLJobCondition{ |
|
|
|
|
|
|
|
|
var latestCondition sednav1.LLJobCondition = sednav1.LLJobCondition{ |
|
|
Stage: sednav1.LLJobTrain, |
|
|
Stage: sednav1.LLJobTrain, |
|
|
Type: initialType, |
|
|
Type: initialType, |
|
|
} |
|
|
} |
|
|
@@ -305,14 +348,16 @@ func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, er |
|
|
var newConditionType sednav1.LLJobStageConditionType |
|
|
var newConditionType sednav1.LLJobStageConditionType |
|
|
var needUpdated = false |
|
|
var needUpdated = false |
|
|
|
|
|
|
|
|
var podStatus = v1.PodUnknown |
|
|
|
|
|
|
|
|
var podStatus v1.PodPhase = v1.PodUnknown |
|
|
|
|
|
var pod *v1.Pod |
|
|
|
|
|
|
|
|
jobConditions := job.Status.Conditions |
|
|
jobConditions := job.Status.Conditions |
|
|
if len(jobConditions) > 0 { |
|
|
if len(jobConditions) > 0 { |
|
|
// get latest pod and pod status |
|
|
// get latest pod and pod status |
|
|
latestCondition = (jobConditions)[len(jobConditions)-1] |
|
|
latestCondition = (jobConditions)[len(jobConditions)-1] |
|
|
klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", job.Namespace, job.Name, |
|
|
klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", job.Namespace, job.Name, |
|
|
latestCondition.Stage) |
|
|
latestCondition.Stage) |
|
|
pod := c.getSpecifiedPods(job, string(latestCondition.Stage)) |
|
|
|
|
|
|
|
|
pod = c.getSpecifiedPods(job, string(latestCondition.Stage)) |
|
|
|
|
|
|
|
|
if pod != nil { |
|
|
if pod != nil { |
|
|
podStatus = pod.Status.Phase |
|
|
podStatus = pod.Status.Phase |
|
|
@@ -337,25 +382,30 @@ func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, er |
|
|
err = c.restartInferPod(job) |
|
|
err = c.restartInferPod(job) |
|
|
if err != nil { |
|
|
if err != nil { |
|
|
klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err) |
|
|
klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err) |
|
|
} else { |
|
|
|
|
|
klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name) |
|
|
|
|
|
|
|
|
return needUpdated, err |
|
|
} |
|
|
} |
|
|
} else if podStatus != v1.PodPending && podStatus != v1.PodRunning { |
|
|
|
|
|
err = c.createPod(job, jobStage) |
|
|
|
|
|
} |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return needUpdated, err |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name) |
|
|
|
|
|
newConditionType = sednav1.LLJobStageCondCompleted |
|
|
|
|
|
} else { |
|
|
|
|
|
if podStatus != v1.PodPending && podStatus != v1.PodRunning { |
|
|
|
|
|
err = c.createPod(job, jobStage) |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return needUpdated, err |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
newConditionType = sednav1.LLJobStageCondStarting |
|
|
} |
|
|
} |
|
|
newConditionType = sednav1.LLJobStageCondStarting |
|
|
|
|
|
|
|
|
|
|
|
case sednav1.LLJobStageCondStarting, sednav1.LLJobStageCondRunning: |
|
|
case sednav1.LLJobStageCondStarting, sednav1.LLJobStageCondRunning: |
|
|
if podStatus == v1.PodRunning { |
|
|
if podStatus == v1.PodRunning { |
|
|
if jobStage == sednav1.LLJobDeploy { |
|
|
|
|
|
newConditionType = sednav1.LLJobStageCondCompleted |
|
|
|
|
|
} else { |
|
|
|
|
|
// watch pod status, if pod running, set type running |
|
|
|
|
|
newConditionType = sednav1.LLJobStageCondRunning |
|
|
|
|
|
|
|
|
// add nodeName to job |
|
|
|
|
|
if err := c.setWorkerNodeNameOfJob(job, string(jobStage), pod.Spec.NodeName); err != nil { |
|
|
|
|
|
return needUpdated, err |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// watch pod status, if pod running, set type running |
|
|
|
|
|
newConditionType = sednav1.LLJobStageCondRunning |
|
|
} else if podStatus == v1.PodSucceeded { |
|
|
} else if podStatus == v1.PodSucceeded { |
|
|
// watch pod status, if pod completed, set type completed |
|
|
// watch pod status, if pod completed, set type completed |
|
|
newConditionType = sednav1.LLJobStageCondCompleted |
|
|
newConditionType = sednav1.LLJobStageCondCompleted |
|
|
@@ -541,7 +591,7 @@ func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1 |
|
|
originalDataURLOrIndex = dataset.Spec.URL |
|
|
originalDataURLOrIndex = dataset.Spec.URL |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
var workerParam = new(runtime.WorkerParam) |
|
|
|
|
|
|
|
|
var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) |
|
|
if podtype == sednav1.LLJobTrain { |
|
|
if podtype == sednav1.LLJobTrain { |
|
|
workerParam.WorkerType = "Train" |
|
|
workerParam.WorkerType = "Train" |
|
|
|
|
|
|
|
|
@@ -672,7 +722,7 @@ func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error { |
|
|
return err |
|
|
return err |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
var workerParam = new(runtime.WorkerParam) |
|
|
|
|
|
|
|
|
var workerParam *runtime.WorkerParam = new(runtime.WorkerParam) |
|
|
workerParam.Mounts = append(workerParam.Mounts, |
|
|
workerParam.Mounts = append(workerParam.Mounts, |
|
|
runtime.WorkerMount{ |
|
|
runtime.WorkerMount{ |
|
|
URL: &runtime.MountURL{ |
|
|
URL: &runtime.MountURL{ |
|
|
|