|
|
@@ -4,7 +4,9 @@ import ( |
|
|
"context" |
|
|
"context" |
|
|
"encoding/json" |
|
|
"encoding/json" |
|
|
"fmt" |
|
|
"fmt" |
|
|
|
|
|
"strings" |
|
|
|
|
|
|
|
|
|
|
|
v1 "k8s.io/api/core/v1" |
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
|
|
"k8s.io/klog/v2" |
|
|
"k8s.io/klog/v2" |
|
|
|
|
|
|
|
|
@@ -45,7 +47,7 @@ func newUnmarshalError(namespace, name, operation string, content []byte) error |
|
|
return fmt.Errorf("Unable to unmarshal content for (%s/%s) operation: '%s', content: '%+v'", namespace, name, operation, string(content)) |
|
|
return fmt.Errorf("Unable to unmarshal content for (%s/%s) operation: '%s', content: '%+v'", namespace, name, operation, string(content)) |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
func checkUpstreamOpeation(operation string) error { |
|
|
|
|
|
|
|
|
func checkUpstreamOperation(operation string) error { |
|
|
// current only support the 'status' operation |
|
|
// current only support the 'status' operation |
|
|
if operation != "status" { |
|
|
if operation != "status" { |
|
|
return fmt.Errorf("unknown operation %s", operation) |
|
|
return fmt.Errorf("unknown operation %s", operation) |
|
|
@@ -75,7 +77,7 @@ func (uc *UpstreamController) updateDatasetStatus(name, namespace string, status |
|
|
|
|
|
|
|
|
// updateDatasetFromEdge syncs update from edge |
|
|
// updateDatasetFromEdge syncs update from edge |
|
|
func (uc *UpstreamController) updateDatasetFromEdge(name, namespace, operation string, content []byte) error { |
|
|
func (uc *UpstreamController) updateDatasetFromEdge(name, namespace, operation string, content []byte) error { |
|
|
err := checkUpstreamOpeation(operation) |
|
|
|
|
|
|
|
|
err := checkUpstreamOperation(operation) |
|
|
if err != nil { |
|
|
if err != nil { |
|
|
return err |
|
|
return err |
|
|
} |
|
|
} |
|
|
@@ -124,7 +126,7 @@ func (uc *UpstreamController) updateJointInferenceMetrics(name, namespace string |
|
|
|
|
|
|
|
|
// updateJointInferenceFromEdge syncs the edge updates to k8s |
|
|
// updateJointInferenceFromEdge syncs the edge updates to k8s |
|
|
func (uc *UpstreamController) updateJointInferenceFromEdge(name, namespace, operation string, content []byte) error { |
|
|
func (uc *UpstreamController) updateJointInferenceFromEdge(name, namespace, operation string, content []byte) error { |
|
|
err := checkUpstreamOpeation(operation) |
|
|
|
|
|
|
|
|
err := checkUpstreamOperation(operation) |
|
|
if err != nil { |
|
|
if err != nil { |
|
|
return err |
|
|
return err |
|
|
} |
|
|
} |
|
|
@@ -218,7 +220,7 @@ func (uc *UpstreamController) appendFederatedLearningJobStatusCondition(name, na |
|
|
|
|
|
|
|
|
// updateFederatedLearningJobFromEdge updates the federated job's status |
|
|
// updateFederatedLearningJobFromEdge updates the federated job's status |
|
|
func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace, operation string, content []byte) (err error) { |
|
|
func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace, operation string, content []byte) (err error) { |
|
|
err = checkUpstreamOpeation(operation) |
|
|
|
|
|
|
|
|
err = checkUpstreamOperation(operation) |
|
|
if err != nil { |
|
|
if err != nil { |
|
|
return err |
|
|
return err |
|
|
} |
|
|
} |
|
|
@@ -276,6 +278,83 @@ func (uc *UpstreamController) updateFederatedLearningJobFromEdge(name, namespace |
|
|
return nil |
|
|
return nil |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
func (uc *UpstreamController) appendIncrementalLearningJobStatusCondition(name, namespace string, cond neptunev1.ILJobCondition) error { |
|
|
|
|
|
client := uc.client.IncrementalLearningJobs(namespace) |
|
|
|
|
|
return retryUpdateStatus(name, namespace, (func() error { |
|
|
|
|
|
job, err := client.Get(context.TODO(), name, metav1.GetOptions{}) |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return err |
|
|
|
|
|
} |
|
|
|
|
|
job.Status.Conditions = append(job.Status.Conditions, cond) |
|
|
|
|
|
_, err = client.UpdateStatus(context.TODO(), job, metav1.UpdateOptions{}) |
|
|
|
|
|
return err |
|
|
|
|
|
})) |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// updateIncrementalLearningFromEdge syncs the edge updates to k8s |
|
|
|
|
|
func (uc *UpstreamController) updateIncrementalLearningFromEdge(name, namespace, operation string, content []byte) error { |
|
|
|
|
|
err := checkUpstreamOperation(operation) |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return err |
|
|
|
|
|
} |
|
|
|
|
|
var jobStatus struct { |
|
|
|
|
|
Phase string `json:"phase"` |
|
|
|
|
|
Status string `json:"status"` |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
err = json.Unmarshal(content, &jobStatus) |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return newUnmarshalError(namespace, name, operation, content) |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Get the condition data. |
|
|
|
|
|
// Here unmarshal and marshal immediately to skip the unnecessary fields |
|
|
|
|
|
var condData IncrementalCondData |
|
|
|
|
|
err = json.Unmarshal(content, &condData) |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return newUnmarshalError(namespace, name, operation, content) |
|
|
|
|
|
} |
|
|
|
|
|
condDataBytes, _ := json.Marshal(&condData) |
|
|
|
|
|
|
|
|
|
|
|
cond := neptunev1.ILJobCondition{ |
|
|
|
|
|
Status: v1.ConditionTrue, |
|
|
|
|
|
LastHeartbeatTime: metav1.Now(), |
|
|
|
|
|
LastTransitionTime: metav1.Now(), |
|
|
|
|
|
Data: string(condDataBytes), |
|
|
|
|
|
Message: "reported by lc", |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
switch strings.ToLower(jobStatus.Phase) { |
|
|
|
|
|
case "train": |
|
|
|
|
|
cond.Stage = neptunev1.ILJobTrain |
|
|
|
|
|
case "eval": |
|
|
|
|
|
cond.Stage = neptunev1.ILJobEval |
|
|
|
|
|
case "deploy": |
|
|
|
|
|
cond.Stage = neptunev1.ILJobDeploy |
|
|
|
|
|
default: |
|
|
|
|
|
return fmt.Errorf("invalid condition stage: %v", jobStatus.Phase) |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
switch strings.ToLower(jobStatus.Status) { |
|
|
|
|
|
case "ready": |
|
|
|
|
|
cond.Type = neptunev1.ILJobStageCondReady |
|
|
|
|
|
case "completed": |
|
|
|
|
|
cond.Type = neptunev1.ILJobStageCondCompleted |
|
|
|
|
|
case "failed": |
|
|
|
|
|
cond.Type = neptunev1.ILJobStageCondFailed |
|
|
|
|
|
case "waiting": |
|
|
|
|
|
cond.Type = neptunev1.ILJobStageCondWaiting |
|
|
|
|
|
default: |
|
|
|
|
|
return fmt.Errorf("invalid condition type: %v", jobStatus.Status) |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
err = uc.appendIncrementalLearningJobStatusCondition(name, namespace, cond) |
|
|
|
|
|
if err != nil { |
|
|
|
|
|
return fmt.Errorf("failed to append condition, err:%+w", err) |
|
|
|
|
|
} |
|
|
|
|
|
return nil |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
// syncEdgeUpdate receives the updates from edge and syncs these to k8s. |
|
|
// syncEdgeUpdate receives the updates from edge and syncs these to k8s. |
|
|
func (uc *UpstreamController) syncEdgeUpdate() { |
|
|
func (uc *UpstreamController) syncEdgeUpdate() { |
|
|
for { |
|
|
for { |
|
|
@@ -336,9 +415,10 @@ func NewUpstreamController(cfg *config.ControllerConfig) (FeatureControllerI, er |
|
|
// NOTE: current no direct model update from edge, |
|
|
// NOTE: current no direct model update from edge, |
|
|
// model update will be triggered by the corresponding training feature |
|
|
// model update will be triggered by the corresponding training feature |
|
|
uc.updateHandlers = map[string]updateHandler{ |
|
|
uc.updateHandlers = map[string]updateHandler{ |
|
|
"dataset": uc.updateDatasetFromEdge, |
|
|
|
|
|
"jointinferenceservice": uc.updateJointInferenceFromEdge, |
|
|
|
|
|
"federatedlearningjob": uc.updateFederatedLearningJobFromEdge, |
|
|
|
|
|
|
|
|
"dataset": uc.updateDatasetFromEdge, |
|
|
|
|
|
"jointinferenceservice": uc.updateJointInferenceFromEdge, |
|
|
|
|
|
"federatedlearningjob": uc.updateFederatedLearningJobFromEdge, |
|
|
|
|
|
"incrementallearningjob": uc.updateIncrementalLearningFromEdge, |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
return uc, nil |
|
|
return uc, nil |
|
|
|