You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

lifelonglearningjob.go 25 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907
  1. /*
  2. Copyright 2021 The KubeEdge Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package lifelonglearning
  14. import (
  15. "bufio"
  16. "encoding/json"
  17. "fmt"
  18. "os"
  19. "path"
  20. "strconv"
  21. "strings"
  22. "sync"
  23. "time"
  24. "k8s.io/klog/v2"
  25. "github.com/kubeedge/sedna/cmd/sedna-lc/app/options"
  26. sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
  27. "github.com/kubeedge/sedna/pkg/globalmanager/runtime"
  28. "github.com/kubeedge/sedna/pkg/localcontroller/db"
  29. clienttypes "github.com/kubeedge/sedna/pkg/localcontroller/gmclient"
  30. "github.com/kubeedge/sedna/pkg/localcontroller/managers/dataset"
  31. "github.com/kubeedge/sedna/pkg/localcontroller/storage"
  32. "github.com/kubeedge/sedna/pkg/localcontroller/trigger"
  33. "github.com/kubeedge/sedna/pkg/localcontroller/util"
  34. workertypes "github.com/kubeedge/sedna/pkg/localcontroller/worker"
  35. )
  36. const (
  37. //KindName is kind of lifelong-learning-job resource
  38. KindName = "lifelonglearningjob"
  39. // TrainPhase is the train phase
  40. TrainPhase = "train"
  41. // EvalPhase is the eval phase
  42. EvalPhase = "eval"
  43. // DeployPhase is the deploy phase
  44. DeployPhase = "deploy"
  45. // TriggerReadyStatus is the ready status about trigger
  46. TriggerReadyStatus = "ready"
  47. // TriggerCompletedStatus is the completed status about trigger
  48. TriggerCompletedStatus = "completed"
  49. )
  50. // LifelongLearningJobManager defines lifelong-learning-job Manager
  51. type Manager struct {
  52. Client clienttypes.ClientI
  53. WorkerMessageChannel chan workertypes.MessageContent
  54. DatasetManager *dataset.Manager
  55. LifelongLearningJobMap map[string]*Job
  56. VolumeMountPrefix string
  57. }
  58. // LifelongLearningJob defines config for lifelong-learning-job
  59. type Job struct {
  60. sednav1.LifelongLearningJob
  61. Dataset *dataset.Dataset
  62. Done chan struct{}
  63. Storage storage.Storage
  64. JobConfig *LLJobConfig
  65. }
  66. // LLJobConfig defines config for lifelong-learning-job
  67. type LLJobConfig struct {
  68. UniqueIdentifier string
  69. Version int
  70. Phase string
  71. WorkerStatus string
  72. TrainTrigger trigger.Base
  73. TriggerStatus string
  74. TriggerTime time.Time
  75. TrainDataURL string
  76. EvalDataURL string
  77. OutputDir string
  78. OutputConfig *LLOutputConfig
  79. DataSamples *LLDataSamples
  80. TrainModel *Model
  81. DeployModel *Model
  82. EvalResult *Model
  83. Lock sync.Mutex
  84. }
  85. type Model = clienttypes.Model
  86. // LLOutputConfig defines config for job output
  87. type LLOutputConfig struct {
  88. SamplesOutput map[string]string
  89. TrainOutput string
  90. EvalOutput string
  91. }
  92. // LLDataSamples defines samples information
  93. type LLDataSamples struct {
  94. Numbers int
  95. TrainSamples []string
  96. EvalVersionSamples [][]string
  97. EvalSamples []string
  98. }
  99. const (
  100. // LLJobIterationIntervalSeconds is interval time of each iteration of job
  101. LLJobIterationIntervalSeconds = 10
  102. // LLHandlerDataIntervalSeconds is interval time of handling dataset
  103. LLHandlerDataIntervalSeconds = 10
  104. // LLLLEvalSamplesCapacity is capacity of eval samples
  105. LLEvalSamplesCapacity = 5
  106. )
  107. // New creates a lifelong-learning-job manager
  108. func New(client clienttypes.ClientI, datasetManager *dataset.Manager, options *options.LocalControllerOptions) *Manager {
  109. lm := Manager{
  110. Client: client,
  111. WorkerMessageChannel: make(chan workertypes.MessageContent, workertypes.MessageChannelCacheSize),
  112. DatasetManager: datasetManager,
  113. LifelongLearningJobMap: make(map[string]*Job),
  114. VolumeMountPrefix: options.VolumeMountPrefix,
  115. }
  116. return &lm
  117. }
  118. // Insert inserts lifelong-learning-job config to db
  119. func (lm *Manager) Insert(message *clienttypes.Message) error {
  120. name := util.GetUniqueIdentifier(message.Header.Namespace, message.Header.ResourceName, message.Header.ResourceKind)
  121. first := false
  122. job, ok := lm.LifelongLearningJobMap[name]
  123. if !ok {
  124. job = &Job{}
  125. job.Storage = storage.Storage{IsLocalStorage: false}
  126. job.Done = make(chan struct{})
  127. lm.LifelongLearningJobMap[name] = job
  128. first = true
  129. }
  130. if err := json.Unmarshal(message.Content, &job); err != nil {
  131. return err
  132. }
  133. credential := job.ObjectMeta.Annotations[runtime.SecretAnnotationKey]
  134. if credential != "" {
  135. if err := job.Storage.SetCredential(credential); err != nil {
  136. return fmt.Errorf("failed to set job(name=%s)'s storage credential, error: %+v", name, err)
  137. }
  138. }
  139. if first {
  140. go lm.startJob(name)
  141. }
  142. if err := db.SaveResource(name, job.TypeMeta, job.ObjectMeta, job.Spec); err != nil {
  143. return err
  144. }
  145. return nil
  146. }
  147. // startJob starts a job
  148. func (lm *Manager) startJob(name string) {
  149. var err error
  150. job, ok := lm.LifelongLearningJobMap[name]
  151. if !ok {
  152. return
  153. }
  154. job.JobConfig = new(LLJobConfig)
  155. jobConfig := job.JobConfig
  156. jobConfig.UniqueIdentifier = name
  157. err = lm.initJob(job)
  158. if err != nil {
  159. klog.Errorf("failed to init job (name=%s): %+v", jobConfig.UniqueIdentifier)
  160. return
  161. }
  162. klog.Infof("lifelong learning job(name=%s) is started", name)
  163. defer klog.Infof("lifelong learning job(name=%s) is stopped", name)
  164. go lm.handleData(job)
  165. tick := time.NewTicker(LLJobIterationIntervalSeconds * time.Second)
  166. for {
  167. select {
  168. case <-job.Done:
  169. return
  170. default:
  171. }
  172. if job.Dataset == nil {
  173. klog.V(3).Infof("job(name=%s) dataset not ready",
  174. jobConfig.UniqueIdentifier)
  175. <-tick.C
  176. continue
  177. }
  178. switch jobConfig.Phase {
  179. case TrainPhase:
  180. err = lm.trainTask(job)
  181. case EvalPhase:
  182. err = lm.evalTask(job)
  183. case DeployPhase:
  184. err = lm.deployTask(job)
  185. default:
  186. klog.Errorf("invalid phase: %s", jobConfig.Phase)
  187. continue
  188. }
  189. if err != nil {
  190. klog.Errorf("job(name=%s) complete the %s task failed, error: %v",
  191. jobConfig.UniqueIdentifier, jobConfig.Phase, err)
  192. }
  193. <-tick.C
  194. }
  195. }
  196. // trainTask starts training task
  197. func (lm *Manager) trainTask(job *Job) error {
  198. jobConfig := job.JobConfig
  199. if jobConfig.WorkerStatus == workertypes.ReadyStatus && jobConfig.TriggerStatus == TriggerReadyStatus {
  200. payload, ok, err := lm.triggerTrainTask(job)
  201. if !ok {
  202. return nil
  203. }
  204. if err != nil {
  205. klog.Errorf("job(name=%s) complete the %sing phase triggering task failed, error: %v",
  206. jobConfig.UniqueIdentifier, jobConfig.Phase, err)
  207. return err
  208. }
  209. err = lm.Client.WriteMessage(payload, job.getHeader())
  210. if err != nil {
  211. klog.Errorf("job(name=%s) failed to write message: %v",
  212. jobConfig.UniqueIdentifier, err)
  213. return err
  214. }
  215. jobConfig.TriggerStatus = TriggerCompletedStatus
  216. klog.Infof("job(name=%s) complete the %sing phase triggering task successfully",
  217. jobConfig.UniqueIdentifier, jobConfig.Phase)
  218. }
  219. if jobConfig.WorkerStatus == workertypes.FailedStatus {
  220. klog.Warningf("found the %sing phase worker that ran failed, "+
  221. "back the training phase triggering task", jobConfig.Phase)
  222. backLLTaskStatus(jobConfig)
  223. }
  224. if jobConfig.WorkerStatus == workertypes.CompletedStatus {
  225. klog.Infof("job(name=%s) complete the %s task successfully", jobConfig.UniqueIdentifier, jobConfig.Phase)
  226. nextLLTask(jobConfig)
  227. }
  228. return nil
  229. }
  230. // evalTask starts eval task
  231. func (lm *Manager) evalTask(job *Job) error {
  232. jobConfig := job.JobConfig
  233. if jobConfig.WorkerStatus == workertypes.ReadyStatus && jobConfig.TriggerStatus == TriggerReadyStatus {
  234. payload, err := lm.triggerEvalTask(job)
  235. if err != nil {
  236. klog.Errorf("job(name=%s) complete the %sing phase triggering task failed, error: %v",
  237. jobConfig.UniqueIdentifier, jobConfig.Phase, err)
  238. return err
  239. }
  240. err = lm.Client.WriteMessage(payload, job.getHeader())
  241. if err != nil {
  242. return err
  243. }
  244. jobConfig.TriggerStatus = TriggerCompletedStatus
  245. klog.Infof("job(name=%s) complete the %sing phase triggering task successfully",
  246. jobConfig.UniqueIdentifier, jobConfig.Phase)
  247. }
  248. if jobConfig.WorkerStatus == workertypes.FailedStatus {
  249. msg := fmt.Sprintf("job(name=%s) found the %sing phase worker that ran failed, "+
  250. "back the training phase triggering task", jobConfig.UniqueIdentifier, jobConfig.Phase)
  251. klog.Errorf(msg)
  252. return fmt.Errorf(msg)
  253. }
  254. if jobConfig.WorkerStatus == workertypes.CompletedStatus {
  255. klog.Infof("job(name=%s) complete the %s task successfully", jobConfig.UniqueIdentifier, jobConfig.Phase)
  256. nextLLTask(jobConfig)
  257. }
  258. return nil
  259. }
  260. // deployTask starts deploy task
  261. func (lm *Manager) deployTask(job *Job) error {
  262. jobConfig := job.JobConfig
  263. if jobConfig.WorkerStatus == workertypes.ReadyStatus && jobConfig.TriggerStatus == TriggerReadyStatus {
  264. status := clienttypes.UpstreamMessage{}
  265. status.Phase = DeployPhase
  266. deployModel, err := lm.deployModel(job)
  267. if err != nil {
  268. klog.Errorf("failed to deploy model for job(name=%s): %v", jobConfig.UniqueIdentifier, err)
  269. } else {
  270. klog.Infof("deployed model for job(name=%s) successfully", jobConfig.UniqueIdentifier)
  271. }
  272. if err != nil || deployModel == nil {
  273. status.Status = workertypes.FailedStatus
  274. } else {
  275. status.Status = workertypes.ReadyStatus
  276. status.Input = &clienttypes.Input{
  277. Models: []Model{
  278. *deployModel,
  279. },
  280. }
  281. }
  282. if err = lm.Client.WriteMessage(status, job.getHeader()); err != nil {
  283. return err
  284. }
  285. jobConfig.TriggerStatus = TriggerCompletedStatus
  286. }
  287. nextLLTask(jobConfig)
  288. klog.Infof("job(name=%s) complete the deploy task successfully", jobConfig.UniqueIdentifier)
  289. return nil
  290. }
  291. // triggerTrainTask triggers the train task
  292. func (lm *Manager) triggerTrainTask(job *Job) (interface{}, bool, error) {
  293. var err error
  294. jobConfig := job.JobConfig
  295. const numOfSamples = "num_of_samples"
  296. samples := map[string]interface{}{
  297. numOfSamples: len(jobConfig.DataSamples.TrainSamples),
  298. }
  299. isTrigger := jobConfig.TrainTrigger.Trigger(samples)
  300. if !isTrigger {
  301. return nil, false, nil
  302. }
  303. jobConfig.Version++
  304. var dataIndexURL string
  305. jobConfig.TrainDataURL, dataIndexURL, err = job.writeLLJSamples(jobConfig.DataSamples.TrainSamples,
  306. jobConfig.OutputConfig.SamplesOutput["train"])
  307. if err != nil {
  308. klog.Errorf("train phase: write samples to the file(%s) is failed, error: %v", jobConfig.TrainDataURL, err)
  309. return nil, false, err
  310. }
  311. dataURL := jobConfig.TrainDataURL
  312. outputDir := strings.Join([]string{jobConfig.OutputConfig.TrainOutput, strconv.Itoa(jobConfig.Version)}, "/")
  313. if job.Storage.IsLocalStorage {
  314. dataURL = util.TrimPrefixPath(lm.VolumeMountPrefix, dataURL)
  315. dataIndexURL = util.TrimPrefixPath(lm.VolumeMountPrefix, dataIndexURL)
  316. outputDir = util.TrimPrefixPath(lm.VolumeMountPrefix, outputDir)
  317. }
  318. input := clienttypes.Input{
  319. DataURL: dataURL,
  320. DataIndexURL: dataIndexURL,
  321. OutputDir: outputDir,
  322. }
  323. msg := clienttypes.UpstreamMessage{
  324. Phase: TrainPhase,
  325. Status: workertypes.ReadyStatus,
  326. Input: &input,
  327. }
  328. jobConfig.TriggerTime = time.Now()
  329. return &msg, true, nil
  330. }
  331. // triggerEvalTask triggers the eval task
  332. func (lm *Manager) triggerEvalTask(job *Job) (*clienttypes.UpstreamMessage, error) {
  333. jobConfig := job.JobConfig
  334. var err error
  335. var dataIndexURL string
  336. jobConfig.EvalDataURL, dataIndexURL, err = job.writeLLJSamples(jobConfig.DataSamples.EvalSamples, jobConfig.OutputConfig.SamplesOutput["eval"])
  337. if err != nil {
  338. klog.Errorf("job(name=%s) eval phase: write samples to the file(%s) is failed, error: %v",
  339. jobConfig.UniqueIdentifier, jobConfig.EvalDataURL, err)
  340. return nil, err
  341. }
  342. var models []Model
  343. models = append(models, Model{
  344. Format: jobConfig.TrainModel.Format,
  345. URL: jobConfig.TrainModel.URL,
  346. })
  347. dataURL := jobConfig.EvalDataURL
  348. outputDir := strings.Join([]string{jobConfig.OutputConfig.EvalOutput, strconv.Itoa(jobConfig.Version)}, "/")
  349. if job.Storage.IsLocalStorage {
  350. dataURL = util.TrimPrefixPath(lm.VolumeMountPrefix, dataURL)
  351. dataIndexURL = util.TrimPrefixPath(lm.VolumeMountPrefix, dataIndexURL)
  352. outputDir = util.TrimPrefixPath(lm.VolumeMountPrefix, outputDir)
  353. }
  354. input := clienttypes.Input{
  355. Models: models,
  356. DataURL: dataURL,
  357. DataIndexURL: dataIndexURL,
  358. OutputDir: outputDir,
  359. }
  360. msg := &clienttypes.UpstreamMessage{
  361. Phase: EvalPhase,
  362. Status: workertypes.ReadyStatus,
  363. Input: &input,
  364. }
  365. return msg, nil
  366. }
  367. // deployModel deploys model
  368. func (lm *Manager) deployModel(job *Job) (*Model, error) {
  369. jobConfig := job.JobConfig
  370. model := &Model{}
  371. model = jobConfig.EvalResult
  372. if job.Storage.IsLocalStorage {
  373. model.URL = util.AddPrefixPath(lm.VolumeMountPrefix, model.URL)
  374. }
  375. deployModelURL := jobConfig.DeployModel.URL
  376. if err := job.Storage.CopyFile(model.URL, deployModelURL); err != nil {
  377. return nil, fmt.Errorf("copy model(url=%s) to the deploy model(url=%s) failed, error: %+v",
  378. model.URL, deployModelURL, err)
  379. }
  380. klog.V(4).Infof("copy model(url=%s) to the deploy model(url=%s) successfully", model.URL, deployModelURL)
  381. klog.Infof("job(name=%s) deploys model(url=%s) successfully", jobConfig.UniqueIdentifier, model.URL)
  382. return model, nil
  383. }
  384. // createOutputDir creates the job output dir
  385. func (job *Job) createOutputDir(jobConfig *LLJobConfig) error {
  386. outputDir := jobConfig.OutputDir
  387. dirNames := []string{"data/train", "data/eval", "train", "eval"}
  388. // lifelong_kb_index.pkl
  389. if job.Storage.IsLocalStorage {
  390. if err := util.CreateFolder(outputDir); err != nil {
  391. klog.Errorf("job(name=%s) create fold %s failed", jobConfig.UniqueIdentifier, outputDir)
  392. return err
  393. }
  394. for _, v := range dirNames {
  395. dir := path.Join(outputDir, v)
  396. if err := util.CreateFolder(dir); err != nil {
  397. klog.Errorf("job(name=%s) create fold %s failed", jobConfig.UniqueIdentifier, dir)
  398. return err
  399. }
  400. }
  401. }
  402. outputConfig := LLOutputConfig{
  403. SamplesOutput: map[string]string{
  404. "train": strings.Join([]string{strings.TrimRight(outputDir, "/"), dirNames[0]}, "/"),
  405. "eval": strings.Join([]string{strings.TrimRight(outputDir, "/"), dirNames[1]}, "/"),
  406. },
  407. TrainOutput: strings.Join([]string{strings.TrimRight(outputDir, "/"), dirNames[2]}, "/"),
  408. EvalOutput: strings.Join([]string{strings.TrimRight(outputDir, "/"), dirNames[3]}, "/"),
  409. }
  410. jobConfig.OutputConfig = &outputConfig
  411. return nil
  412. }
  413. // createFile creates data file and data index file
  414. func (job *Job) createFile(dir string, format string, isLocalStorage bool) (string, string) {
  415. switch strings.ToLower(format) {
  416. case dataset.TXTFormat:
  417. if isLocalStorage {
  418. return path.Join(dir, "data.txt"), ""
  419. }
  420. return strings.Join([]string{dir, "data.txt"}, "/"), strings.Join([]string{dir, "dataIndex.txt"}, "/")
  421. case dataset.CSVFormat:
  422. return strings.Join([]string{dir, "data.csv"}, "/"), ""
  423. }
  424. return "", ""
  425. }
  426. // writeLLJSamples writes samples information to a file
  427. func (job *Job) writeLLJSamples(samples []string, dir string) (string, string, error) {
  428. version := job.JobConfig.Version
  429. format := job.Dataset.Spec.Format
  430. urlPrefix := job.Dataset.URLPrefix
  431. subDir := strings.Join([]string{dir, strconv.Itoa(version)}, "/")
  432. fileURL, absURLFile := job.createFile(subDir, format, job.Dataset.Storage.IsLocalStorage)
  433. if job.Storage.IsLocalStorage {
  434. if err := util.CreateFolder(subDir); err != nil {
  435. return "", "", err
  436. }
  437. if err := job.writeByLine(samples, fileURL, format); err != nil {
  438. return "", "", err
  439. }
  440. if !job.Dataset.Storage.IsLocalStorage && absURLFile != "" {
  441. tempSamples := util.ParsingDatasetIndex(samples, urlPrefix)
  442. if err := job.writeByLine(tempSamples, absURLFile, format); err != nil {
  443. return "", "", err
  444. }
  445. }
  446. return fileURL, absURLFile, nil
  447. }
  448. temporaryDir, err := util.CreateTemporaryDir()
  449. if err != nil {
  450. return "", "", err
  451. }
  452. localFileURL, localAbsURLFile := job.createFile(temporaryDir, format, job.Dataset.Storage.IsLocalStorage)
  453. if err := job.writeByLine(samples, localFileURL, format); err != nil {
  454. return "", "", err
  455. }
  456. if err := job.Storage.Upload(localFileURL, fileURL); err != nil {
  457. return "", "", err
  458. }
  459. if absURLFile != "" {
  460. tempSamples := util.ParsingDatasetIndex(samples, urlPrefix)
  461. if err := job.writeByLine(tempSamples, localAbsURLFile, format); err != nil {
  462. return "", "", err
  463. }
  464. if err := job.Storage.Upload(localAbsURLFile, absURLFile); err != nil {
  465. return "", "", err
  466. }
  467. defer os.RemoveAll(localFileURL)
  468. }
  469. defer os.RemoveAll(localAbsURLFile)
  470. return fileURL, absURLFile, nil
  471. }
  472. // writeByLine writes file by line
  473. func (job *Job) writeByLine(samples []string, fileURL string, format string) error {
  474. file, err := os.Create(fileURL)
  475. if err != nil {
  476. klog.Errorf("create file(%s) failed", fileURL)
  477. return err
  478. }
  479. w := bufio.NewWriter(file)
  480. if format == "csv" {
  481. _, _ = fmt.Fprintln(w, job.Dataset.DataSource.Header)
  482. }
  483. for _, line := range samples {
  484. _, _ = fmt.Fprintln(w, line)
  485. }
  486. if err := w.Flush(); err != nil {
  487. klog.Errorf("write file(%s) failed", fileURL)
  488. return err
  489. }
  490. if err := file.Close(); err != nil {
  491. klog.Errorf("close file failed, error: %v", err)
  492. return err
  493. }
  494. return nil
  495. }
  496. // handleData updates samples information
  497. func (lm *Manager) handleData(job *Job) {
  498. tick := time.NewTicker(LLHandlerDataIntervalSeconds * time.Second)
  499. jobConfig := job.JobConfig
  500. iterCount := 0
  501. for {
  502. select {
  503. case <-job.Done:
  504. return
  505. default:
  506. }
  507. // in case dataset is not synced to LC before job synced to LC
  508. // here call loadDataset in each period
  509. err := lm.loadDataset(job)
  510. if iterCount%100 == 0 {
  511. klog.Infof("job(name=%s) handling dataset", jobConfig.UniqueIdentifier)
  512. }
  513. iterCount++
  514. if err != nil {
  515. klog.Warningf("job(name=%s) failed to load dataset, and waiting it: %v",
  516. jobConfig.UniqueIdentifier,
  517. err)
  518. <-tick.C
  519. continue
  520. }
  521. dataset := job.Dataset
  522. if dataset.DataSource != nil && len(dataset.DataSource.TrainSamples) > jobConfig.DataSamples.Numbers {
  523. samples := dataset.DataSource.TrainSamples
  524. trainNum := int(job.Spec.Dataset.TrainProb * float64(len(samples)-jobConfig.DataSamples.Numbers))
  525. jobConfig.Lock.Lock()
  526. jobConfig.DataSamples.TrainSamples = append(jobConfig.DataSamples.TrainSamples,
  527. samples[(jobConfig.DataSamples.Numbers+1):(jobConfig.DataSamples.Numbers+trainNum+1)]...)
  528. klog.Infof("job(name=%s) current train samples nums is %d",
  529. jobConfig.UniqueIdentifier, len(jobConfig.DataSamples.TrainSamples))
  530. jobConfig.DataSamples.EvalVersionSamples = append(jobConfig.DataSamples.EvalVersionSamples,
  531. samples[(jobConfig.DataSamples.Numbers+trainNum+1):])
  532. jobConfig.Lock.Unlock()
  533. for _, v := range jobConfig.DataSamples.EvalVersionSamples {
  534. jobConfig.DataSamples.EvalSamples = append(jobConfig.DataSamples.EvalSamples, v...)
  535. }
  536. klog.Infof("job(name=%s) current eval samples nums is %d",
  537. jobConfig.UniqueIdentifier, len(jobConfig.DataSamples.EvalSamples))
  538. jobConfig.DataSamples.Numbers = len(samples)
  539. }
  540. <-tick.C
  541. }
  542. }
  543. func (lm *Manager) loadDataset(job *Job) error {
  544. if job.Dataset != nil {
  545. // already loaded
  546. return nil
  547. }
  548. datasetName := util.GetUniqueIdentifier(job.Namespace, job.Spec.Dataset.Name, dataset.KindName)
  549. dataset, ok := lm.DatasetManager.GetDataset(datasetName)
  550. if !ok || dataset == nil {
  551. return fmt.Errorf("not exists dataset(name=%s)", datasetName)
  552. }
  553. jobConfig := job.JobConfig
  554. jobConfig.DataSamples = &LLDataSamples{
  555. Numbers: 0,
  556. TrainSamples: make([]string, 0),
  557. EvalVersionSamples: make([][]string, 0),
  558. EvalSamples: make([]string, 0),
  559. }
  560. job.Dataset = dataset
  561. return nil
  562. }
  563. // initJob inits the job object
  564. func (lm *Manager) initJob(job *Job) error {
  565. jobConfig := job.JobConfig
  566. jobConfig.TrainModel = new(Model)
  567. jobConfig.EvalResult = new(Model)
  568. jobConfig.Lock = sync.Mutex{}
  569. jobConfig.Version = 0
  570. jobConfig.Phase = TrainPhase
  571. jobConfig.WorkerStatus = workertypes.ReadyStatus
  572. jobConfig.TriggerStatus = TriggerReadyStatus
  573. trainTrigger, err := newLLTrigger(job.Spec.TrainSpec.Trigger)
  574. if err != nil {
  575. return fmt.Errorf("failed to init train trigger: %+w", err)
  576. }
  577. jobConfig.TrainTrigger = trainTrigger
  578. outputDir := job.Spec.OutputDir
  579. isLocalURL, err := job.Storage.IsLocalURL(outputDir)
  580. if err != nil {
  581. return fmt.Errorf("job(name=%s)'s output dir is invalid, error: %+v", job.Name, outputDir)
  582. }
  583. if isLocalURL {
  584. job.Storage.IsLocalStorage = true
  585. outputDir = util.AddPrefixPath(lm.VolumeMountPrefix, outputDir)
  586. }
  587. jobConfig.OutputDir = outputDir
  588. if err := job.createOutputDir(jobConfig); err != nil {
  589. return err
  590. }
  591. jobConfig.DeployModel = &Model{
  592. Format: "pkl",
  593. URL: strings.Join([]string{strings.TrimRight(outputDir, "/"), "deploy/index.pkl"}, "/"),
  594. }
  595. return nil
  596. }
  597. func newLLTrigger(t sednav1.LLTrigger) (trigger.Base, error) {
  598. // convert trigger to map
  599. triggerMap := make(map[string]interface{})
  600. c, err := json.Marshal(t)
  601. if err != nil {
  602. return nil, err
  603. }
  604. err = json.Unmarshal(c, &triggerMap)
  605. if err != nil {
  606. return nil, err
  607. }
  608. return trigger.NewTrigger(triggerMap)
  609. }
  610. // forwardSamplesLL deletes the samples information in the memory
  611. func forwardSamplesLL(jobConfig *LLJobConfig) {
  612. switch jobConfig.Phase {
  613. case TrainPhase:
  614. {
  615. jobConfig.Lock.Lock()
  616. jobConfig.DataSamples.TrainSamples = jobConfig.DataSamples.TrainSamples[:0]
  617. jobConfig.Lock.Unlock()
  618. }
  619. case EvalPhase:
  620. {
  621. if len(jobConfig.DataSamples.EvalVersionSamples) > LLEvalSamplesCapacity {
  622. jobConfig.DataSamples.EvalVersionSamples = jobConfig.DataSamples.EvalVersionSamples[1:]
  623. }
  624. }
  625. }
  626. }
  627. // backLLTaskStatus backs train task status
  628. func backLLTaskStatus(jobConfig *LLJobConfig) {
  629. jobConfig.Phase = TrainPhase
  630. initLLTaskStatus(jobConfig)
  631. }
  632. // initLLTaskStatus inits task status
  633. func initLLTaskStatus(jobConfig *LLJobConfig) {
  634. jobConfig.WorkerStatus = workertypes.ReadyStatus
  635. jobConfig.TriggerStatus = TriggerReadyStatus
  636. }
  637. // nextLLTask converts next task status
  638. func nextLLTask(jobConfig *LLJobConfig) {
  639. switch jobConfig.Phase {
  640. case TrainPhase:
  641. {
  642. forwardSamplesLL(jobConfig)
  643. initLLTaskStatus(jobConfig)
  644. jobConfig.Phase = EvalPhase
  645. }
  646. case EvalPhase:
  647. {
  648. forwardSamplesLL(jobConfig)
  649. initLLTaskStatus(jobConfig)
  650. jobConfig.Phase = DeployPhase
  651. }
  652. case DeployPhase:
  653. {
  654. backLLTaskStatus(jobConfig)
  655. }
  656. }
  657. }
  658. // Delete deletes lifelong-learning-job config in db
  659. func (lm *Manager) Delete(message *clienttypes.Message) error {
  660. name := util.GetUniqueIdentifier(message.Header.Namespace, message.Header.ResourceName, message.Header.ResourceKind)
  661. if job, ok := lm.LifelongLearningJobMap[name]; ok && job.Done != nil {
  662. close(job.Done)
  663. }
  664. delete(lm.LifelongLearningJobMap, name)
  665. if err := db.DeleteResource(name); err != nil {
  666. return err
  667. }
  668. return nil
  669. }
  670. // Start starts LifelongLearningJob manager
  671. func (lm *Manager) Start() error {
  672. go lm.monitorWorker()
  673. return nil
  674. }
  675. // monitorWorker monitors message from worker
  676. func (lm *Manager) monitorWorker() {
  677. for {
  678. workerMessageChannel := lm.WorkerMessageChannel
  679. workerMessage, ok := <-workerMessageChannel
  680. if !ok {
  681. break
  682. }
  683. klog.V(4).Infof("handling worker message %+v", workerMessage)
  684. name := util.GetUniqueIdentifier(workerMessage.Namespace, workerMessage.OwnerName, workerMessage.OwnerKind)
  685. job, ok := lm.LifelongLearningJobMap[name]
  686. if !ok {
  687. continue
  688. }
  689. // TODO: filter some worker messages out
  690. wo := clienttypes.Output{}
  691. wo.Models = workerMessage.Results
  692. wo.OwnerInfo = workerMessage.OwnerInfo
  693. msg := &clienttypes.UpstreamMessage{
  694. Phase: workerMessage.Kind,
  695. Status: workerMessage.Status,
  696. Output: &wo,
  697. }
  698. lm.Client.WriteMessage(msg, job.getHeader())
  699. lm.handleWorkerMessage(job, workerMessage)
  700. }
  701. }
  702. // handleWorkerMessage handles message from worker
  703. func (lm *Manager) handleWorkerMessage(job *Job, workerMessage workertypes.MessageContent) {
  704. jobPhase := job.JobConfig.Phase
  705. workerKind := workerMessage.Kind
  706. if jobPhase != workerKind {
  707. klog.Warningf("job(name=%s) %s phase get worker(kind=%s)", job.JobConfig.UniqueIdentifier,
  708. jobPhase, workerKind)
  709. return
  710. }
  711. var models []*Model
  712. for _, result := range workerMessage.Results {
  713. model := Model{
  714. Format: result["format"].(string),
  715. URL: result["url"].(string)}
  716. models = append(models, &model)
  717. }
  718. model := &Model{}
  719. if len(models) != 1 {
  720. return
  721. }
  722. model = models[0]
  723. job.JobConfig.WorkerStatus = workerMessage.Status
  724. if job.JobConfig.WorkerStatus == workertypes.CompletedStatus {
  725. switch job.JobConfig.Phase {
  726. case TrainPhase:
  727. job.JobConfig.TrainModel = model
  728. case EvalPhase:
  729. job.JobConfig.EvalResult = model
  730. }
  731. }
  732. }
  733. // AddWorkerMessage adds worker messages
  734. func (lm *Manager) AddWorkerMessage(message workertypes.MessageContent) {
  735. lm.WorkerMessageChannel <- message
  736. }
  737. // GetName returns name of the manager
  738. func (lm *Manager) GetName() string {
  739. return KindName
  740. }
  741. func (job *Job) getHeader() clienttypes.MessageHeader {
  742. return clienttypes.MessageHeader{
  743. Namespace: job.Namespace,
  744. ResourceKind: job.Kind,
  745. ResourceName: job.Name,
  746. Operation: clienttypes.StatusOperation,
  747. }
  748. }