- /*
- Copyright 2021 The KubeEdge Authors.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
- package incrementallearning
-
- import (
- "bufio"
- "encoding/json"
- "fmt"
- "io/ioutil"
- "os"
- "path"
- "path/filepath"
- "strconv"
- "strings"
- "sync"
- "time"
-
- "k8s.io/klog/v2"
-
- "github.com/kubeedge/sedna/cmd/sedna-lc/app/options"
- sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
- gmtypes "github.com/kubeedge/sedna/pkg/globalmanager/controllers/incrementallearning"
- "github.com/kubeedge/sedna/pkg/globalmanager/runtime"
- "github.com/kubeedge/sedna/pkg/localcontroller/db"
- clienttypes "github.com/kubeedge/sedna/pkg/localcontroller/gmclient"
- "github.com/kubeedge/sedna/pkg/localcontroller/managers/dataset"
- "github.com/kubeedge/sedna/pkg/localcontroller/managers/model"
- "github.com/kubeedge/sedna/pkg/localcontroller/storage"
- "github.com/kubeedge/sedna/pkg/localcontroller/trigger"
- "github.com/kubeedge/sedna/pkg/localcontroller/util"
- workertypes "github.com/kubeedge/sedna/pkg/localcontroller/worker"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- )
-
- // IncrementalLearningJob defines config for incremental-learning-job
- type Job struct {
- sednav1.IncrementalLearningJob
- JobConfig *JobConfig
- }
-
- // JobConfig defines config for incremental-learning-job
- type JobConfig struct {
- UniqueIdentifier string
- Rounds int
- TrainTrigger trigger.Base
- DeployTrigger trigger.Base
- TriggerTime time.Time
- TrainTriggerStatus string
- EvalTriggerStatus string
- DeployTriggerStatus string
- HotModelUpdateDeployTriggerStatus string
- TrainDataURL string
- EvalDataURL string
- OutputDir string
- OutputConfig *OutputConfig
- DataSamples *DataSamples
- TrainModel *Model
- DeployModel *Model
- EvalModel *Model
- EvalResult []Model
- Lock sync.Mutex
- Dataset *dataset.Dataset
- Storage storage.Storage
- Done chan struct{}
- }
-
- type Model = clienttypes.Model
-
- // OutputConfig defines config for job output
- type OutputConfig struct {
- SamplesOutput map[string]string `json:"trainData"`
- TrainOutput string `json:"trainOutput"`
- EvalOutput string `json:"evalOutput"`
- }
-
- // DataSamples defines samples information
- type DataSamples struct {
- PreviousNumbers int
- TrainSamples []string
- EvalVersionSamples [][]string
- EvalSamples []string
- }
-
- // IncrementalLearningJob defines incremental-learning-job manager
- type Manager struct {
- Client clienttypes.ClientI
- WorkerMessageChannel chan workertypes.MessageContent
- DatasetManager *dataset.Manager
- ModelManager *model.Manager
- IncrementalJobMap map[string]*Job
- VolumeMountPrefix string
- }
-
- const (
- // JobIterationIntervalSeconds is interval time of each iteration of job
- JobIterationIntervalSeconds = 10
- // DatasetHandlerIntervalSeconds is interval time of handling dataset
- DatasetHandlerIntervalSeconds = 10
- // EvalSamplesCapacity is capacity of eval samples
- EvalSamplesCapacity = 5
- //KindName is kind of incremental-learning-job resource
- KindName = "incrementallearningjob"
-
- // TriggerReadyStatus is the ready status about trigger
- TriggerReadyStatus = "ready"
- // TriggerCompletedStatus is the completed status about trigger
- TriggerCompletedStatus = "completed"
-
- AnnotationsRoundsKey = "sedna.io/rounds"
- AnnotationsNumberOfSamplesKey = "sedna.io/number-of-samples"
- AnnotationsDataFileOfEvalKey = "sedna.io/data-file-of-eval"
- )
-
- // New creates a incremental-learning-job manager
- func New(client clienttypes.ClientI, datasetManager *dataset.Manager,
- modelManager *model.Manager, options *options.LocalControllerOptions) *Manager {
- im := Manager{
- Client: client,
- WorkerMessageChannel: make(chan workertypes.MessageContent, workertypes.MessageChannelCacheSize),
- DatasetManager: datasetManager,
- ModelManager: modelManager,
- IncrementalJobMap: make(map[string]*Job),
- VolumeMountPrefix: options.VolumeMountPrefix,
- }
-
- return &im
- }
-
- // Start starts incremental-learning-job manager
- func (im *Manager) Start() error {
- go im.monitorWorker()
-
- return nil
- }
-
- // trainTask starts training task
- func (im *Manager) trainTask(job *Job) error {
- jobConfig := job.JobConfig
-
- latestCond := im.getLatestCondition(job)
- jobStage := latestCond.Stage
- currentType := latestCond.Type
-
- if currentType == sednav1.ILJobStageCondWaiting {
- var err error
-
- err = im.loadDataset(job)
- if err != nil || jobConfig.Dataset == nil || jobConfig.Dataset.DataSource == nil {
- return fmt.Errorf("job(%s) failed to load dataset, and waiting it: %w",
- jobConfig.UniqueIdentifier, err)
- }
-
- if jobConfig.Dataset == nil || jobConfig.Dataset.DataSource == nil {
- return fmt.Errorf("job(%s)'s dataset not ready", jobConfig.UniqueIdentifier)
- }
-
- err = im.loadTrainModel(job)
- if err != nil {
- return fmt.Errorf("failed to sync train model, and waiting it: %w", err)
- }
-
- initTriggerStatus(jobConfig)
-
- if jobConfig.TrainTriggerStatus == TriggerReadyStatus {
- payload, ok, err := im.triggerTrainTask(job)
- if !ok {
- return nil
- }
-
- if err != nil {
- klog.Errorf("job(%s) failed to complete the %sing phase triggering task: %v",
- jobConfig.UniqueIdentifier, jobStage, err)
- job.JobConfig.Rounds--
- return err
- }
-
- err = im.Client.WriteMessage(payload, job.getHeader())
- if err != nil {
- klog.Errorf("job(%s) failed to write message: %v", jobConfig.UniqueIdentifier, err)
- job.JobConfig.Rounds--
- return err
- }
-
- forwardSamples(jobConfig, jobStage)
-
- err = im.saveJobToDB(job)
- if err != nil {
- klog.Errorf("job(%s) failed to save job to db: %v",
- jobConfig.UniqueIdentifier, err)
- // continue anyway
- }
-
- jobConfig.TrainTriggerStatus = TriggerCompletedStatus
- klog.Infof("job(%s) completed the %sing phase triggering task successfully",
- jobConfig.UniqueIdentifier, jobStage)
- }
- }
-
- return nil
- }
-
- // evalTask starts eval task
- func (im *Manager) evalTask(job *Job) error {
- jobConfig := job.JobConfig
-
- latestCond := im.getLatestCondition(job)
- jobStage := latestCond.Stage
- currentType := latestCond.Type
-
- if currentType == sednav1.ILJobStageCondWaiting {
- var err error
-
- err = im.loadDataset(job)
- if err != nil || jobConfig.Dataset == nil || jobConfig.Dataset.DataSource == nil {
- return fmt.Errorf("job(%s) failed to load dataset, and waiting it: %w",
- jobConfig.UniqueIdentifier, err)
- }
-
- err = im.loadDeployModel(job)
- if err != nil {
- return fmt.Errorf("failed to sync deploy model, and waiting it: %w", err)
- }
-
- if job.Spec.EvalSpec.InitialModel != nil {
- err = im.loadEvalModel(job)
- if err != nil {
- return fmt.Errorf("failed to sync initial eval model, and waiting it: %w", err)
- }
- } else {
- if jobConfig.EvalModel == nil {
- jobConfig.EvalModel = jobConfig.DeployModel
- }
- }
-
- if jobConfig.EvalTriggerStatus == TriggerReadyStatus {
- payload, err := im.triggerEvalTask(job)
- if err != nil {
- klog.Errorf("job(%s) completed the %sing phase triggering task failed: %v",
- jobConfig.UniqueIdentifier, jobStage, err)
- return err
- }
-
- err = im.Client.WriteMessage(payload, job.getHeader())
- if err != nil {
- klog.Errorf("job(%s) failed to write message: %v", jobConfig.UniqueIdentifier, err)
- return err
- }
-
- forwardSamples(jobConfig, jobStage)
-
- jobConfig.EvalTriggerStatus = TriggerCompletedStatus
- klog.Infof("job(%s) completed the %sing phase triggering task successfully",
- jobConfig.UniqueIdentifier, jobStage)
- }
- }
-
- return nil
- }
-
- // hotModelUpdateDeployTask starts deploy task when job supports hot model update
- func (im *Manager) hotModelUpdateDeployTask(job *Job) error {
- var localModelConfigFile string
- if v, ok := job.ObjectMeta.Annotations[runtime.ModelHotUpdateAnnotationsKey]; ok {
- localModelConfigFile = v
- } else {
- return nil
- }
-
- if job.JobConfig.HotModelUpdateDeployTriggerStatus == TriggerReadyStatus {
- var err error
- err = im.loadDeployModel(job)
- if err != nil {
- return fmt.Errorf("failed to sync deploy model, and waiting it: %w", err)
- }
-
- if job.Spec.EvalSpec.InitialModel != nil {
- err = im.loadEvalModel(job)
- if err != nil {
- return fmt.Errorf("failed to sync initial eval model, and waiting it: %w", err)
- }
- }
-
- trainedModel := im.getModelFromJobConditions(job, sednav1.ILJobDeploy)
- deployModel := job.JobConfig.DeployModel
-
- trainedModelURL := trainedModel.URL
- modelName := filepath.Base(trainedModelURL)
- localHostDir := filepath.Dir(localModelConfigFile)
- localHostModelFile := filepath.Join(localHostDir, modelName)
-
- modelFile := util.AddPrefixPath(im.VolumeMountPrefix, localHostModelFile)
- if err := im.updateDeployModelFile(job, trainedModelURL, modelFile); err != nil {
- return err
- }
-
- deployModelURL := deployModel.URL
- if err := im.updateDeployModelFile(job, trainedModelURL, deployModelURL); err != nil {
- return err
- }
-
- evalModel := job.JobConfig.EvalModel
- if evalModel != nil {
- newEvalModel := im.getModelFromJobConditions(job, sednav1.ILJobEval)
- if err := im.updateDeployModelFile(job, newEvalModel.URL, evalModel.URL); err != nil {
- return err
- }
- }
-
- config := map[string]map[string]string{
- "model_config": {
- "model_path": strings.Replace(localHostModelFile, localHostDir,
- runtime.ModelHotUpdateContainerPrefix, 1),
- "model_update_time": time.Now().String(),
- },
- }
-
- jsonConfig, err := json.MarshalIndent(config, "", " ")
- if err != nil {
- return err
- }
-
- modelConfigFile := util.AddPrefixPath(im.VolumeMountPrefix, localModelConfigFile)
- // overwrite file
- err = ioutil.WriteFile(modelConfigFile, jsonConfig, 0644)
- if err != nil {
- klog.Errorf("job(%s) write model config file(url=%s) failed in deploy phase: %v",
- job.JobConfig.UniqueIdentifier, modelConfigFile, err)
- return err
- }
-
- job.JobConfig.HotModelUpdateDeployTriggerStatus = TriggerCompletedStatus
- klog.V(4).Infof("job(%s) write model config file(url=%s) successfully in deploy phase",
- job.JobConfig.UniqueIdentifier, modelConfigFile)
- klog.Infof("job(%s) completed the %s task successfully", job.JobConfig.UniqueIdentifier, sednav1.ILJobDeploy)
- }
-
- return nil
- }
-
- // deployTask starts deploy task
- func (im *Manager) deployTask(job *Job) error {
- if job.JobConfig.DeployTriggerStatus == TriggerReadyStatus {
- if err := im.loadDeployModel(job); err != nil {
- return fmt.Errorf("failed to sync deploy model, and waiting it: %w", err)
- }
-
- if !job.Spec.DeploySpec.Model.HotUpdateEnabled && job.Spec.EvalSpec.InitialModel != nil {
- err := im.loadEvalModel(job)
- if err != nil {
- return fmt.Errorf("failed to sync initial eval model, and waiting it: %w", err)
- }
- }
-
- jobConfig := job.JobConfig
- var err error
- var neededDeploy bool
-
- neededDeploy, err = im.triggerDeployTask(job)
- status := clienttypes.UpstreamMessage{Phase: string(sednav1.ILJobDeploy)}
-
- if err == nil && neededDeploy {
- var models []Model
- trainedModel := im.getModelFromJobConditions(job, sednav1.ILJobDeploy)
- deployModel := jobConfig.DeployModel
- models = append(models, *trainedModel, *deployModel)
-
- if !job.Spec.DeploySpec.Model.HotUpdateEnabled {
- err = im.updateDeployModelFile(job, trainedModel.URL, deployModel.URL)
- if err != nil {
- status.Status = string(sednav1.ILJobStageCondFailed)
- klog.Errorf("failed to update model for job(%s): %v", jobConfig.UniqueIdentifier, err)
- } else {
- status.Status = string(sednav1.ILJobStageCondReady)
- klog.Infof("update model for job(%s) successfully", jobConfig.UniqueIdentifier)
- }
-
- evalModel := job.JobConfig.EvalModel
- if evalModel != nil {
- newEvalModel := im.getModelFromJobConditions(job, sednav1.ILJobEval)
- if err := im.updateDeployModelFile(job, newEvalModel.URL, evalModel.URL); err != nil {
- return err
- }
- }
- } else {
- status.Status = string(sednav1.ILJobStageCondReady)
- }
-
- status.Input = &clienttypes.Input{
- Models: models,
- }
-
- klog.Infof("job(%s) completed the %sing phase triggering task successfully",
- jobConfig.UniqueIdentifier, sednav1.ILJobDeploy)
- } else {
- // No need to deploy, just report completed status
- // TODO: instead of reporting deploy-completed, another more reasonable status
- klog.Infof("job(%s) isn't need to deploy model", jobConfig.UniqueIdentifier)
- status.Status = string(sednav1.ILJobStageCondCompleted)
- }
-
- err = im.Client.WriteMessage(status, job.getHeader())
- if err != nil {
- klog.Errorf("job(%s) completed the %s task failed: %v",
- jobConfig.UniqueIdentifier, sednav1.ILJobDeploy, err)
- }
-
- job.JobConfig.DeployTriggerStatus = TriggerCompletedStatus
- }
-
- return nil
- }
-
- // startJob starts a job
- func (im *Manager) startJob(name string) {
- var err error
- job := im.IncrementalJobMap[name]
-
- err = im.initJob(job, name)
- if err != nil {
- klog.Errorf("failed to init job (name=%s): %+v", name)
- return
- }
-
- klog.Infof("incremental job(%s) was started", name)
- defer klog.Infof("incremental learning job(%s) was stopped", name)
-
- // handle data from dataset
- go im.handleData(job)
-
- tick := time.NewTicker(JobIterationIntervalSeconds * time.Second)
- for {
- select {
- case <-job.JobConfig.Done:
- return
- default:
- }
-
- cond := im.getLatestCondition(job)
- jobStage := cond.Stage
-
- switch jobStage {
- case sednav1.ILJobTrain:
- err = im.trainTask(job)
- case sednav1.ILJobEval:
- err = im.evalTask(job)
- case sednav1.ILJobDeploy:
- if cond.Type == sednav1.ILJobStageCondWaiting {
- err = im.deployTask(job)
- } else if cond.Type == sednav1.ILJobStageCondRunning && job.Spec.DeploySpec.Model.HotUpdateEnabled {
- err = im.hotModelUpdateDeployTask(job)
- }
- default:
- klog.Errorf("invalid phase: %s", jobStage)
- continue
- }
-
- if err != nil {
- klog.Errorf("job(%s) failed to complete the %s task: %v", name, jobStage, err)
- }
-
- <-tick.C
- }
- }
-
- // Insert inserts incremental-learning-job config to db
- func (im *Manager) Insert(message *clienttypes.Message) error {
- name := util.GetUniqueIdentifier(message.Header.Namespace, message.Header.ResourceName, message.Header.ResourceKind)
-
- first := false
- job, ok := im.IncrementalJobMap[name]
- if !ok {
- job = &Job{}
- im.IncrementalJobMap[name] = job
- first = true
- }
-
- if err := json.Unmarshal(message.Content, &job); err != nil {
- return err
- }
-
- if err := db.SaveResource(name, job.TypeMeta, job.ObjectMeta, job.Spec); err != nil {
- return err
- }
-
- if first {
- go im.startJob(name)
- }
-
- return nil
- }
-
- // deleteModelHotUpdateData deletes the local data of model hot update
- func (im *Manager) deleteModelHotUpdateData(job *Job) error {
- if configFile, ok := job.ObjectMeta.Annotations[runtime.ModelHotUpdateAnnotationsKey]; ok {
- localHostDir := filepath.Dir(configFile)
- dir := util.AddPrefixPath(im.VolumeMountPrefix, localHostDir)
- if err := os.RemoveAll(dir); err != nil {
- return fmt.Errorf("failed to delete the dir(%s): %w", dir, err)
- }
- }
-
- return nil
- }
-
- // Delete deletes incremental-learning-job config in db
- func (im *Manager) Delete(message *clienttypes.Message) error {
- name := util.GetUniqueIdentifier(message.Header.Namespace, message.Header.ResourceName, message.Header.ResourceKind)
-
- if job, ok := im.IncrementalJobMap[name]; ok && job.JobConfig.Done != nil {
- close(job.JobConfig.Done)
-
- if err := im.deleteModelHotUpdateData(job); err != nil {
- klog.Errorf("job(%s) failed to delete data of model hot update: %v", name, err)
- // continue anyway
- }
- }
-
- delete(im.IncrementalJobMap, name)
-
- if err := db.DeleteResource(name); err != nil {
- return err
- }
-
- return nil
- }
-
- // updateJobFromDB updates job from db
- func (im *Manager) updateJobFromDB(job *Job) error {
- var err error
-
- previousJob, err := db.GetResource(job.JobConfig.UniqueIdentifier)
- if err != nil {
- return err
- }
-
- m := metav1.ObjectMeta{}
- if err != json.Unmarshal([]byte(previousJob.ObjectMeta), &m) {
- return err
- }
-
- rounds, ok := m.Annotations[AnnotationsRoundsKey]
- if !ok {
- return nil
- }
-
- if job.JobConfig.Rounds, err = strconv.Atoi(rounds); err != nil {
- return err
- }
-
- numberOfSamples, ok := m.Annotations[AnnotationsNumberOfSamplesKey]
- if !ok {
- return nil
- }
-
- if job.JobConfig.DataSamples.PreviousNumbers, err = strconv.Atoi(numberOfSamples); err != nil {
- return err
- }
-
- dataFileOfEval, ok := m.Annotations[AnnotationsDataFileOfEvalKey]
- if !ok {
- return nil
- }
-
- localURL, err := job.JobConfig.Storage.Download(dataFileOfEval, "")
-
- if !job.JobConfig.Storage.IsLocalStorage {
- defer os.RemoveAll(localURL)
- }
-
- if err != nil {
- return err
- }
-
- samples, err := dataset.GetSamples(dataFileOfEval)
- if err != nil {
- klog.Errorf("read file %s failed: %v", dataFileOfEval, err)
- return err
- }
-
- job.JobConfig.DataSamples.EvalVersionSamples = append(job.JobConfig.DataSamples.EvalVersionSamples, samples)
-
- return nil
- }
-
- // saveJobToDB saves job info to db
- func (im *Manager) saveJobToDB(job *Job) error {
- ann := job.ObjectMeta.Annotations
- if ann == nil {
- ann = make(map[string]string)
- }
-
- ann[AnnotationsRoundsKey] = strconv.Itoa(job.JobConfig.Rounds)
- ann[AnnotationsNumberOfSamplesKey] = strconv.Itoa(job.JobConfig.DataSamples.PreviousNumbers)
- ann[AnnotationsDataFileOfEvalKey] = job.JobConfig.EvalDataURL
-
- return db.SaveResource(job.JobConfig.UniqueIdentifier, job.TypeMeta, job.ObjectMeta, job.Spec)
- }
-
- // initJob inits the job object
- func (im *Manager) initJob(job *Job, name string) error {
- job.JobConfig = new(JobConfig)
-
- jobConfig := job.JobConfig
- jobConfig.UniqueIdentifier = name
-
- jobConfig.Storage = storage.Storage{IsLocalStorage: false}
- credential := job.ObjectMeta.Annotations[runtime.SecretAnnotationKey]
- if credential != "" {
- if err := job.JobConfig.Storage.SetCredential(credential); err != nil {
- return fmt.Errorf("failed to set storage credential: %w", err)
- }
- }
-
- jobConfig.Done = make(chan struct{})
- jobConfig.Lock = sync.Mutex{}
- jobConfig.Rounds = 0
-
- jobConfig.DataSamples = &DataSamples{
- PreviousNumbers: 0,
- TrainSamples: make([]string, 0),
- EvalVersionSamples: make([][]string, 0),
- EvalSamples: make([]string, 0),
- }
-
- trainTrigger, err := newTrigger(job.Spec.TrainSpec.Trigger)
- if err != nil {
- return fmt.Errorf("failed to init train trigger: %+w", err)
- }
- deployTrigger, err := newTrigger(job.Spec.DeploySpec.Trigger)
- if err != nil {
- return fmt.Errorf("failed to init deploy trigger: %+w", err)
- }
- jobConfig.TrainTrigger = trainTrigger
- jobConfig.DeployTrigger = deployTrigger
-
- outputDir := job.Spec.OutputDir
-
- isLocalURL, err := jobConfig.Storage.IsLocalURL(outputDir)
- if err != nil {
- return fmt.Errorf("job(%s)'s output dir(%s) is invalid: %+w", job.Name, outputDir, err)
- }
-
- if isLocalURL {
- jobConfig.Storage.IsLocalStorage = true
- outputDir = util.AddPrefixPath(im.VolumeMountPrefix, outputDir)
- }
-
- jobConfig.OutputDir = outputDir
-
- if err := job.createOutputDir(jobConfig); err != nil {
- return err
- }
-
- if err := im.updateJobFromDB(job); err != nil {
- klog.Errorf("job(%s) failed to update job from db: %v", name, err)
- }
-
- initTriggerStatus(jobConfig)
-
- return nil
- }
-
- func initTriggerStatus(jobConfig *JobConfig) {
- jobConfig.TrainTriggerStatus = TriggerReadyStatus
- jobConfig.EvalTriggerStatus = TriggerReadyStatus
- jobConfig.DeployTriggerStatus = TriggerReadyStatus
- jobConfig.HotModelUpdateDeployTriggerStatus = TriggerReadyStatus
- }
-
- func newTrigger(t sednav1.Trigger) (trigger.Base, error) {
- // convert trigger to map
- triggerMap := make(map[string]interface{})
- c, err := json.Marshal(t)
- if err != nil {
- return nil, err
- }
-
- err = json.Unmarshal(c, &triggerMap)
- if err != nil {
- return nil, err
- }
- return trigger.NewTrigger(triggerMap)
- }
-
- // getModelsFromJobConditions gets models from job condition
- func (im *Manager) getModelsFromJobConditions(jobConditions []sednav1.ILJobCondition, stage sednav1.ILJobStage, currentType sednav1.ILJobStageConditionType, dataType string) []Model {
- // TODO: runtime.type changes to common.type for gm and lc
- for i := len(jobConditions) - 1; i >= 0; i-- {
- var cond gmtypes.IncrementalCondData
- jobCond := jobConditions[i]
- if jobCond.Stage == stage && jobCond.Type == currentType {
- if err := (&cond).Unmarshal([]byte(jobCond.Data)); err != nil {
- continue
- }
-
- if dataType == "input" {
- if cond.Input == nil {
- continue
- }
-
- return cond.Input.Models
- } else if dataType == "output" {
- if cond.Output == nil {
- continue
- }
-
- return cond.Output.Models
- }
- }
- }
-
- return nil
- }
-
- // getEvalResult gets eval result from job conditions
- func (im *Manager) getEvalResult(job *Job) ([]map[string][]float64, error) {
- jobConditions := job.Status.Conditions
- models := im.getModelsFromJobConditions(jobConditions, sednav1.ILJobEval, sednav1.ILJobStageCondCompleted, "output")
-
- var result []map[string][]float64
- var err error
- for _, m := range models {
- bytes, err := json.Marshal(m.Metrics)
- if err != nil {
- return nil, err
- }
-
- data := make(map[string][]float64)
- if err = json.Unmarshal(bytes, &data); err != nil {
- return nil, err
- }
-
- result = append(result, data)
- }
- return result, err
- }
-
- // getModelFromJobConditions gets model from job conditions for train/eval/deploy
- func (im *Manager) getModelFromJobConditions(job *Job, jobStage sednav1.ILJobStage) *Model {
- jobConditions := job.Status.Conditions
- jobConfig := job.JobConfig
-
- getModel := func(initModel *Model, models []Model) *Model {
- for _, m := range models {
- if m.Format == initModel.Format {
- if initModel.Devices != nil && len(m.Devices) == 1 {
- for _, d := range initModel.Devices {
- if m.Devices[0] == d {
- return &m
- }
- }
- } else {
- return &m
- }
- }
- }
-
- return nil
- }
-
- models := im.getModelsFromJobConditions(jobConditions, sednav1.ILJobTrain, sednav1.ILJobStageCondCompleted, "output")
- if models == nil {
- return nil
- }
-
- var model *Model
- switch jobStage {
- case sednav1.ILJobTrain:
- model = jobConfig.TrainModel
- case sednav1.ILJobEval:
- model = jobConfig.EvalModel
- case sednav1.ILJobDeploy:
- model = jobConfig.DeployModel
- }
- if model == nil {
- return nil
- }
-
- return getModel(model, models)
- }
-
- // triggerTrainTask triggers the train task
- func (im *Manager) triggerTrainTask(job *Job) (interface{}, bool, error) {
- var err error
- jobConfig := job.JobConfig
-
- const numOfSamples = "num_of_samples"
- samples := map[string]interface{}{
- numOfSamples: len(jobConfig.DataSamples.TrainSamples),
- }
-
- isTrigger := jobConfig.TrainTrigger.Trigger(samples)
-
- if !isTrigger {
- return nil, false, nil
- }
-
- job.JobConfig.Rounds++
-
- var m *Model
- rounds := jobConfig.Rounds
- if rounds <= 1 {
- m = jobConfig.TrainModel
- } else {
- m = im.getModelFromJobConditions(job, sednav1.ILJobTrain)
- }
-
- var dataIndexURL string
- jobConfig.TrainDataURL, dataIndexURL, err = im.writeSamples(job, jobConfig.DataSamples.TrainSamples,
- jobConfig.OutputConfig.SamplesOutput["train"], rounds, jobConfig.Dataset.Spec.Format, jobConfig.Dataset.URLPrefix)
- if err != nil {
- job.JobConfig.Rounds--
- klog.Errorf("job(%s) train phase: write samples to the file(%s) is failed: %v",
- jobConfig.UniqueIdentifier, jobConfig.TrainDataURL, err)
- return nil, false, err
- }
-
- dataURL := jobConfig.TrainDataURL
- outputDir := strings.Join([]string{jobConfig.OutputConfig.TrainOutput, strconv.Itoa(rounds)}, "/")
- if jobConfig.Storage.IsLocalStorage {
- dataURL = util.TrimPrefixPath(im.VolumeMountPrefix, dataURL)
- dataIndexURL = util.TrimPrefixPath(im.VolumeMountPrefix, dataIndexURL)
- outputDir = util.TrimPrefixPath(im.VolumeMountPrefix, outputDir)
- }
-
- input := clienttypes.Input{
- Models: []Model{*m},
- DataURL: dataURL,
- DataIndexURL: dataIndexURL,
- OutputDir: outputDir,
- }
- msg := clienttypes.UpstreamMessage{
- Phase: string(sednav1.ILJobTrain),
- Status: string(sednav1.ILJobStageCondReady),
- Input: &input,
- }
-
- jobConfig.TriggerTime = time.Now()
- return &msg, true, nil
- }
-
- // triggerEvalTask triggers the eval task
- func (im *Manager) triggerEvalTask(job *Job) (*clienttypes.UpstreamMessage, error) {
- jobConfig := job.JobConfig
- var err error
-
- m := im.getModelFromJobConditions(job, sednav1.ILJobEval)
-
- var models []Model
- models = append(models, *m, *jobConfig.EvalModel)
-
- var dataIndexURL string
- jobConfig.EvalDataURL, dataIndexURL, err = im.writeSamples(job, jobConfig.DataSamples.EvalSamples, jobConfig.OutputConfig.SamplesOutput["eval"],
- job.JobConfig.Rounds, jobConfig.Dataset.Spec.Format, jobConfig.Dataset.URLPrefix)
- if err != nil {
- klog.Errorf("job(%s) eval phase: write samples to the file(%s) is failed: %v",
- jobConfig.UniqueIdentifier, jobConfig.EvalDataURL, err)
- return nil, err
- }
- jobConfig.DataSamples.EvalSamples = []string{}
-
- dataURL := jobConfig.EvalDataURL
- if jobConfig.Storage.IsLocalStorage {
- dataURL = util.TrimPrefixPath(im.VolumeMountPrefix, dataURL)
- dataIndexURL = util.TrimPrefixPath(im.VolumeMountPrefix, dataIndexURL)
- }
-
- input := clienttypes.Input{
- Models: models,
- DataURL: dataURL,
- DataIndexURL: dataIndexURL,
- }
- msg := &clienttypes.UpstreamMessage{
- Phase: string(sednav1.ILJobEval),
- Status: string(sednav1.ILJobStageCondReady),
- Input: &input,
- }
-
- return msg, nil
- }
-
- // triggerDeployTask triggers the deploy task
- func (im *Manager) triggerDeployTask(job *Job) (bool, error) {
- jobConfig := job.JobConfig
-
- evalResult, err := im.getEvalResult(job)
- // EvalResult must has two models info, first is trained model, second is deployed model.
- if len(evalResult) != 2 {
- return false, fmt.Errorf("expected 2 evaluation results, actual: %d", len(jobConfig.EvalResult))
- }
-
- newMetrics := evalResult[0]
- oldMetrics := evalResult[1]
- metricDelta := make(map[string]interface{})
-
- for metric := range newMetrics {
- // keep the full metrics
- metricDelta[metric] = newMetrics[metric]
- var l []float64
- for i := range newMetrics[metric] {
- l = append(l, newMetrics[metric][i]-oldMetrics[metric][i])
- }
- metricDelta[metric+"_delta"] = l
- }
- tt := job.Spec.DeploySpec.Trigger
-
- // convert tt to map
- triggerMap := make(map[string]interface{})
- c, err := json.Marshal(tt)
- if err != nil {
- return false, err
- }
-
- err = json.Unmarshal(c, &triggerMap)
- if err != nil {
- return false, err
- }
-
- return jobConfig.DeployTrigger.Trigger(metricDelta), nil
- }
-
- // updateDeployModelFile updates deploy model file
- func (im *Manager) updateDeployModelFile(job *Job, trainedModel string, deployModel string) error {
- if job.JobConfig.Storage.IsLocalStorage {
- trainedModel = util.AddPrefixPath(im.VolumeMountPrefix, trainedModel)
- }
-
- if err := job.JobConfig.Storage.CopyFile(trainedModel, deployModel); err != nil {
- return fmt.Errorf("failed to copy trained model(url=%s) to the deploy model(url=%s): %w",
- trainedModel, deployModel, err)
- }
-
- klog.V(4).Infof("copy trained model(url=%s) to the deploy model(url=%s) successfully", trainedModel, deployModel)
-
- return nil
- }
-
- // createOutputDir creates the job output dir
- func (job *Job) createOutputDir(jobConfig *JobConfig) error {
- outputDir := jobConfig.OutputDir
-
- dirNames := []string{"data/train", "data/eval", "train", "eval"}
-
- if job.JobConfig.Storage.IsLocalStorage {
- if err := util.CreateFolder(outputDir); err != nil {
- klog.Errorf("job(%s) failed to create folder %s: %v", jobConfig.UniqueIdentifier, outputDir, err)
- return err
- }
-
- for _, v := range dirNames {
- dir := path.Join(outputDir, v)
- if err := util.CreateFolder(dir); err != nil {
- klog.Errorf("job(%s) failed to create folder %s: %v", jobConfig.UniqueIdentifier, dir, err)
- return err
- }
- }
- }
-
- outputConfig := OutputConfig{
- SamplesOutput: map[string]string{
- "train": strings.Join([]string{strings.TrimRight(outputDir, "/"), dirNames[0]}, "/"),
- "eval": strings.Join([]string{strings.TrimRight(outputDir, "/"), dirNames[1]}, "/"),
- },
- TrainOutput: strings.Join([]string{strings.TrimRight(outputDir, "/"), dirNames[2]}, "/"),
- EvalOutput: strings.Join([]string{strings.TrimRight(outputDir, "/"), dirNames[3]}, "/"),
- }
- jobConfig.OutputConfig = &outputConfig
-
- return nil
- }
-
- func (im *Manager) getLatestCondition(job *Job) sednav1.ILJobCondition {
- jobConditions := job.Status.Conditions
- var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{}
- if len(jobConditions) > 0 {
- // get latest pod and pod status
- latestCondition = jobConditions[len(jobConditions)-1]
- }
- return latestCondition
- }
-
- func (im *Manager) getModel(namespace string, name string) (sednav1.Model, error) {
- modelName := util.GetUniqueIdentifier(namespace, name, model.KindName)
- model, ok := im.ModelManager.GetModel(modelName)
- if !ok {
- return model, fmt.Errorf("not exists model(name=%s)", modelName)
- }
- return model, nil
- }
-
- // loadTrainModel loads initial model information for training.
- func (im *Manager) loadTrainModel(job *Job) error {
- jobConfig := job.JobConfig
-
- if jobConfig.TrainModel == nil {
- initialModel, err := im.getModel(job.Namespace, job.Spec.InitialModel.Name)
- if err != nil {
- return err
- }
-
- jobConfig.TrainModel = new(Model)
- jobConfig.TrainModel.Format = initialModel.Spec.Format
- jobConfig.TrainModel.URL = initialModel.Spec.URL
- jobConfig.TrainModel.Devices = initialModel.Spec.Devices
- }
- return nil
- }
-
- // loadEvalModel loads initial model information for eval.
- func (im *Manager) loadEvalModel(job *Job) error {
- jobConfig := job.JobConfig
-
- if jobConfig.EvalModel == nil {
- initialModel, err := im.getModel(job.Namespace, job.Spec.EvalSpec.InitialModel.Name)
- if err != nil {
- return err
- }
-
- jobConfig.EvalModel = new(Model)
- jobConfig.EvalModel.Format = initialModel.Spec.Format
- jobConfig.EvalModel.URL = initialModel.Spec.URL
- jobConfig.EvalModel.Devices = initialModel.Spec.Devices
- }
- return nil
- }
-
- // loadDeployModel loads model information for deploying.
- func (im *Manager) loadDeployModel(job *Job) error {
- jobConfig := job.JobConfig
-
- if jobConfig.DeployModel == nil {
- deployModel, err := im.getModel(job.Namespace, job.Spec.DeploySpec.Model.Name)
- if err != nil {
- return err
- }
-
- jobConfig.DeployModel = new(Model)
- jobConfig.DeployModel.Format = deployModel.Spec.Format
- jobConfig.DeployModel.URL = deployModel.Spec.URL
- jobConfig.DeployModel.Devices = deployModel.Spec.Devices
- }
- return nil
- }
-
- // loadDataset loads dataset information
- func (im *Manager) loadDataset(job *Job) error {
- if job.JobConfig.Dataset != nil {
- // already loaded
- return nil
- }
-
- datasetName := util.GetUniqueIdentifier(job.Namespace, job.Spec.Dataset.Name, dataset.KindName)
- dataset, ok := im.DatasetManager.GetDataset(datasetName)
- if !ok || dataset == nil {
- return fmt.Errorf("not exists dataset(name=%s)", datasetName)
- }
-
- job.JobConfig.Dataset = dataset
- return nil
- }
-
- // handleData updates samples information
- func (im *Manager) handleData(job *Job) {
- tick := time.NewTicker(DatasetHandlerIntervalSeconds * time.Second)
-
- jobConfig := job.JobConfig
- iterCount := 0
- for {
- select {
- case <-jobConfig.Done:
- return
- default:
- }
-
- if iterCount%100 == 0 {
- klog.V(4).Infof("job(%s) is handling dataset", jobConfig.UniqueIdentifier)
- }
- iterCount++
-
- if jobConfig.Dataset == nil || jobConfig.Dataset.DataSource == nil {
- // already loaded dataset
- <-tick.C
- continue
- }
-
- dataset := jobConfig.Dataset
- currentNumberOfSamples := dataset.DataSource.NumberOfSamples
- previousNumberOfSamples := jobConfig.DataSamples.PreviousNumbers
-
- if dataset.DataSource != nil && currentNumberOfSamples > previousNumberOfSamples {
- samples := dataset.DataSource.TrainSamples
- newNumberOfSamples := currentNumberOfSamples - previousNumberOfSamples
- trainNum := int(job.Spec.Dataset.TrainProb * float64(newNumberOfSamples))
-
- jobConfig.Lock.Lock()
- jobConfig.DataSamples.TrainSamples = append(jobConfig.DataSamples.TrainSamples,
- samples[previousNumberOfSamples:previousNumberOfSamples+trainNum]...)
- klog.Infof("job(%s)'s current train samples nums is %d", jobConfig.UniqueIdentifier, trainNum)
-
- jobConfig.DataSamples.EvalVersionSamples = append(jobConfig.DataSamples.EvalVersionSamples,
- samples[previousNumberOfSamples+trainNum:])
- jobConfig.Lock.Unlock()
-
- for _, v := range jobConfig.DataSamples.EvalVersionSamples {
- jobConfig.DataSamples.EvalSamples = append(jobConfig.DataSamples.EvalSamples, v...)
- }
- klog.Infof("job(%s)'s current eval samples nums is %d", jobConfig.UniqueIdentifier, len(jobConfig.DataSamples.EvalSamples))
-
- jobConfig.DataSamples.PreviousNumbers = currentNumberOfSamples
- }
-
- <-tick.C
- }
- }
-
- // createFile creates data file and data index file
- func createFile(dir string, format string, isLocalStorage bool) (string, string) {
- switch format {
- case dataset.TXTFormat:
- if isLocalStorage {
- return path.Join(dir, "data.txt"), ""
- }
- return strings.Join([]string{dir, "data.txt"}, "/"), strings.Join([]string{dir, "dataIndex.txt"}, "/")
- }
- return "", ""
- }
-
- // writeSamples writes samples information to a file
- func (im *Manager) writeSamples(job *Job, samples []string, dir string, rounds int, format string, urlPrefix string) (string, string, error) {
- if samples == nil {
- return "", "", fmt.Errorf("not samples")
- }
-
- jobConfig := job.JobConfig
- subDir := strings.Join([]string{dir, strconv.Itoa(rounds)}, "/")
- fileURL, absURLFile := createFile(subDir, format, jobConfig.Dataset.Storage.IsLocalStorage)
-
- if jobConfig.Storage.IsLocalStorage {
- if err := util.CreateFolder(subDir); err != nil {
- return "", "", err
- }
- if err := im.writeByLine(samples, fileURL); err != nil {
- return "", "", err
- }
-
- if !jobConfig.Dataset.Storage.IsLocalStorage {
- tempSamples := util.ParsingDatasetIndex(samples, urlPrefix)
- if err := im.writeByLine(tempSamples, absURLFile); err != nil {
- return "", "", err
- }
- }
-
- return fileURL, absURLFile, nil
- }
-
- temporaryDir, err := util.CreateTemporaryDir()
- if err != nil {
- return "", "", err
- }
-
- localFileURL, localAbsURLFile := createFile(temporaryDir, format, jobConfig.Dataset.Storage.IsLocalStorage)
-
- if err := im.writeByLine(samples, localFileURL); err != nil {
- return "", "", err
- }
-
- if err := jobConfig.Storage.Upload(localFileURL, fileURL); err != nil {
- return "", "", err
- }
-
- tempSamples := util.ParsingDatasetIndex(samples, urlPrefix)
-
- if err := im.writeByLine(tempSamples, localAbsURLFile); err != nil {
- return "", "", err
- }
-
- if err := jobConfig.Storage.Upload(localAbsURLFile, absURLFile); err != nil {
- return "", "", err
- }
-
- defer os.RemoveAll(localFileURL)
- defer os.RemoveAll(localAbsURLFile)
-
- return fileURL, absURLFile, nil
- }
-
- // writeByLine writes file by line
- func (im *Manager) writeByLine(samples []string, fileURL string) error {
- file, err := os.Create(fileURL)
- if err != nil {
- klog.Errorf("create file(%s) failed", fileURL)
- return err
- }
-
- w := bufio.NewWriter(file)
- for _, line := range samples {
- _, _ = fmt.Fprintln(w, line)
- }
- if err := w.Flush(); err != nil {
- klog.Errorf("failed to write file(%s): %v", fileURL, err)
- return err
- }
-
- if err := file.Close(); err != nil {
- klog.Errorf("failed to close file(%s): %v", fileURL, err)
- return err
- }
-
- return nil
- }
-
- // monitorWorker monitors message from worker
- func (im *Manager) monitorWorker() {
- for {
- workerMessageChannel := im.WorkerMessageChannel
- workerMessage, ok := <-workerMessageChannel
- if !ok {
- break
- }
- klog.V(4).Infof("handling worker message %+v", workerMessage)
-
- name := util.GetUniqueIdentifier(workerMessage.Namespace, workerMessage.OwnerName, workerMessage.OwnerKind)
-
- job, ok := im.IncrementalJobMap[name]
- if !ok {
- continue
- }
-
- // TODO: filter some worker messages out
- wo := clienttypes.Output{}
- wo.Models = workerMessage.Results
- wo.OwnerInfo = workerMessage.OwnerInfo
-
- msg := &clienttypes.UpstreamMessage{
- Phase: workerMessage.Kind,
- Status: workerMessage.Status,
- Output: &wo,
- }
-
- if err := im.Client.WriteMessage(msg, job.getHeader()); err != nil {
- klog.Errorf("job(%s) failed to write message: %v", name, err)
- continue
- }
- }
- }
-
- // forwardSamples deletes the samples information in the memory
- func forwardSamples(jobConfig *JobConfig, jobStage sednav1.ILJobStage) {
- switch jobStage {
- case sednav1.ILJobTrain:
- jobConfig.Lock.Lock()
- jobConfig.DataSamples.TrainSamples = jobConfig.DataSamples.TrainSamples[:0]
- jobConfig.Lock.Unlock()
- case sednav1.ILJobEval:
- if len(jobConfig.DataSamples.EvalVersionSamples) > EvalSamplesCapacity {
- jobConfig.DataSamples.EvalVersionSamples = jobConfig.DataSamples.EvalVersionSamples[1:]
- }
- }
- }
-
- // AddWorkerMessage adds worker messages
- func (im *Manager) AddWorkerMessage(message workertypes.MessageContent) {
- im.WorkerMessageChannel <- message
- }
-
- // GetName returns name of the manager
- func (im *Manager) GetName() string {
- return KindName
- }
-
- func (job *Job) getHeader() clienttypes.MessageHeader {
- return clienttypes.MessageHeader{
- Namespace: job.Namespace,
- ResourceKind: job.Kind,
- ResourceName: job.Name,
- Operation: clienttypes.StatusOperation,
- }
- }
|