You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

lifelonglearningjob.go 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799
  1. /*
  2. Copyright 2021 The KubeEdge Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package lifelonglearning
  14. import (
  15. "context"
  16. "encoding/json"
  17. "fmt"
  18. "strings"
  19. "time"
  20. v1 "k8s.io/api/core/v1"
  21. "k8s.io/apimachinery/pkg/api/errors"
  22. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  23. "k8s.io/apimachinery/pkg/types"
  24. utilrand "k8s.io/apimachinery/pkg/util/rand"
  25. utilruntime "k8s.io/apimachinery/pkg/util/runtime"
  26. "k8s.io/apimachinery/pkg/util/wait"
  27. "k8s.io/apimachinery/pkg/watch"
  28. "k8s.io/client-go/kubernetes"
  29. v1core "k8s.io/client-go/kubernetes/typed/core/v1"
  30. corelisters "k8s.io/client-go/listers/core/v1"
  31. "k8s.io/client-go/tools/cache"
  32. "k8s.io/client-go/tools/record"
  33. "k8s.io/client-go/util/workqueue"
  34. "k8s.io/klog/v2"
  35. k8scontroller "k8s.io/kubernetes/pkg/controller"
  36. sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
  37. sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
  38. sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
  39. "github.com/kubeedge/sedna/pkg/globalmanager/config"
  40. "github.com/kubeedge/sedna/pkg/globalmanager/runtime"
  41. )
  42. const (
  43. // KindName is the kind name of CR this controller controls
  44. KindName = "LifelongLearningJob"
  45. // Name is this controller name
  46. Name = "LifelongLearning"
  47. )
  48. // Kind contains the schema.GroupVersionKind for this controller type.
  49. var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)
  50. // Controller ensures that all LifelongLearningJob objects have corresponding pods to
  51. // run their configured workload.
  52. type Controller struct {
  53. kubeClient kubernetes.Interface
  54. client sednaclientset.SednaV1alpha1Interface
  55. // podStoreSynced returns true if the pod store has been synced at least once.
  56. // Added as a member to the struct to allow injection for testing.
  57. podStoreSynced cache.InformerSynced
  58. // jobStoreSynced returns true if the lifelonglearningjob store has been synced at least once.
  59. // Added as a member to the struct to allow injection for testing.
  60. jobStoreSynced cache.InformerSynced
  61. // A store of jobs
  62. jobLister sednav1listers.LifelongLearningJobLister
  63. // A store of pods, populated by the podController
  64. podStore corelisters.PodLister
  65. // LifelongLearningJobs that need to be updated
  66. queue workqueue.RateLimitingInterface
  67. cfg *config.ControllerConfig
  68. sendToEdgeFunc runtime.DownstreamSendFunc
  69. }
  70. // Run starts the main goroutine responsible for watching and syncing jobs.
  71. func (c *Controller) Run(stopCh <-chan struct{}) {
  72. workers := 1
  73. defer utilruntime.HandleCrash()
  74. defer c.queue.ShutDown()
  75. klog.Infof("Starting %s controller", Name)
  76. defer klog.Infof("Shutting down %s controller", Name)
  77. if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) {
  78. klog.Errorf("failed to wait for %s caches to sync", Name)
  79. return
  80. }
  81. klog.Infof("Starting %s workers", Name)
  82. for i := 0; i < workers; i++ {
  83. go wait.Until(c.worker, time.Second, stopCh)
  84. }
  85. <-stopCh
  86. }
  87. // enqueueByPod enqueues the lifelonglearningjob object of the specified pod.
  88. func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
  89. controllerRef := metav1.GetControllerOf(pod)
  90. if controllerRef == nil {
  91. return
  92. }
  93. if controllerRef.Kind != Kind.Kind {
  94. return
  95. }
  96. service, err := c.jobLister.LifelongLearningJobs(pod.Namespace).Get(controllerRef.Name)
  97. if err != nil {
  98. return
  99. }
  100. if service.UID != controllerRef.UID {
  101. return
  102. }
  103. c.enqueueController(service, immediate)
  104. }
  105. // When a pod is created, enqueue the controller that manages it and update it's expectations.
  106. func (c *Controller) addPod(obj interface{}) {
  107. pod := obj.(*v1.Pod)
  108. if pod.DeletionTimestamp != nil {
  109. // on a restart of the controller, it's possible a new pod shows up in a state that
  110. // is already pending deletion. Prevent the pod from being a creation observation.
  111. c.deletePod(pod)
  112. return
  113. }
  114. // backoff to queue when PodFailed
  115. immediate := pod.Status.Phase != v1.PodFailed
  116. c.enqueueByPod(pod, immediate)
  117. }
  118. // When a pod is updated, figure out what lifelonglearning job manage it and wake them up.
  119. func (c *Controller) updatePod(old, cur interface{}) {
  120. curPod := cur.(*v1.Pod)
  121. oldPod := old.(*v1.Pod)
  122. // no pod update, no queue
  123. if curPod.ResourceVersion == oldPod.ResourceVersion {
  124. return
  125. }
  126. c.addPod(curPod)
  127. }
  128. // deletePod enqueues the lifelonglearningjob obj When a pod is deleted
  129. func (c *Controller) deletePod(obj interface{}) {
  130. pod, ok := obj.(*v1.Pod)
  131. // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
  132. // When a delete is dropped, the relist will notice a pod in the store not
  133. // in the list, leading to the insertion of a tombstone object which contains
  134. // the deleted key/value. Note that this value might be stale. If the pod
  135. // changed labels the new lifelonglearningjob will not be woken up till the periodic resync.
  136. if !ok {
  137. tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
  138. if !ok {
  139. klog.Warningf("couldn't get object from tombstone %+v", obj)
  140. return
  141. }
  142. pod, ok = tombstone.Obj.(*v1.Pod)
  143. if !ok {
  144. klog.Warningf("tombstone contained object that is not a pod %+v", obj)
  145. return
  146. }
  147. }
  148. c.enqueueByPod(pod, true)
  149. }
  150. // obj could be an *sedna.LifelongLearningJob, or a DeletionFinalStateUnknown marker item,
  151. // immediate tells the controller to update the status right away, and should
  152. // happen ONLY when there was a successful pod run.
  153. func (c *Controller) enqueueController(obj interface{}, immediate bool) {
  154. key, err := k8scontroller.KeyFunc(obj)
  155. if err != nil {
  156. utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
  157. return
  158. }
  159. backoff := time.Duration(0)
  160. if !immediate {
  161. backoff = runtime.GetBackoff(c.queue, key)
  162. }
  163. c.queue.AddAfter(key, backoff)
  164. }
  165. // worker runs a worker thread that just dequeues items, processes them, and marks them done.
  166. // It enforces that the syncHandler is never invoked concurrently with the same key.
  167. func (c *Controller) worker() {
  168. for c.processNextWorkItem() {
  169. }
  170. }
  171. func (c *Controller) processNextWorkItem() bool {
  172. key, quit := c.queue.Get()
  173. if quit {
  174. return false
  175. }
  176. defer c.queue.Done(key)
  177. forget, err := c.sync(key.(string))
  178. if err == nil {
  179. if forget {
  180. c.queue.Forget(key)
  181. }
  182. return true
  183. }
  184. utilruntime.HandleError(fmt.Errorf("Error syncing lifelonglearning job: %v", err))
  185. c.queue.AddRateLimited(key)
  186. return true
  187. }
  188. // sync will sync the lifelonglearning job with the given key if it has had its expectations fulfilled, meaning
  189. // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
  190. // concurrently with the same key.
  191. func (c *Controller) sync(key string) (bool, error) {
  192. startTime := time.Now()
  193. defer func() {
  194. klog.V(4).Infof("Finished syncing lifelonglearning job %q (%v)", key, time.Since(startTime))
  195. }()
  196. ns, name, err := cache.SplitMetaNamespaceKey(key)
  197. if err != nil {
  198. return false, err
  199. }
  200. if len(ns) == 0 || len(name) == 0 {
  201. return false, fmt.Errorf("invalid lifelonglearning job key %q: either namespace or name is missing", key)
  202. }
  203. sharedJob, err := c.jobLister.LifelongLearningJobs(ns).Get(name)
  204. if err != nil {
  205. if errors.IsNotFound(err) {
  206. klog.V(4).Infof("lifelonglearning job has been deleted: %v", key)
  207. return true, nil
  208. }
  209. return false, err
  210. }
  211. job := *sharedJob
  212. // set kind for lifelonglearningjob in case that the kind is None
  213. job.SetGroupVersionKind(Kind)
  214. if job.Status.StartTime == nil {
  215. // job is first in
  216. now := metav1.Now()
  217. job.Status.StartTime = &now
  218. }
  219. // if job was finished previously, we don't want to redo the termination
  220. if IsJobFinished(&job) {
  221. return true, nil
  222. }
  223. forget := false
  224. jobFailed := false
  225. needUpdated := false
  226. // transit this job's state machine
  227. needUpdated, err = c.transitJobState(&job)
  228. if err != nil {
  229. klog.V(2).Infof("lifelonglearning job %v/%v failed to be updated, err:%s", job.Namespace, job.Name, err)
  230. }
  231. if needUpdated {
  232. if err := c.updateJobStatus(&job); err != nil {
  233. return forget, err
  234. }
  235. if jobFailed && !IsJobFinished(&job) {
  236. // returning an error will re-enqueue LifelongLearningJob after the backoff period
  237. return forget, fmt.Errorf("failed pod(s) detected for lifelonglearningjob key %q", key)
  238. }
  239. forget = true
  240. }
  241. return forget, err
  242. }
  243. // setWorkerNodeNameOfJob sets the worker nodeName of the specified job
  244. // which is used for downstream to sync job info to the specified LC located in nodeName.
  245. func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.LifelongLearningJob, jobStage string, nodeName string) error {
  246. key := runtime.AnnotationsKeyPrefix + jobStage
  247. return c.addJobAnnotations(job, key, nodeName)
  248. }
  249. // addJobAnnotations adds info in job annotations
  250. func (c *Controller) addJobAnnotations(job *sednav1.LifelongLearningJob, key string, value string) error {
  251. ann := job.GetAnnotations()
  252. if ann[key] == value {
  253. // already set
  254. return nil
  255. }
  256. patchData := metav1.PartialObjectMetadata{
  257. ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{key: value}}}
  258. patchDataBytes, err := json.Marshal(&patchData)
  259. if err != nil {
  260. return err
  261. }
  262. jobClient := c.client.LifelongLearningJobs(job.Namespace)
  263. return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
  264. newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
  265. if err != nil {
  266. return err
  267. }
  268. annotations := newJob.GetAnnotations()
  269. if annotations[key] == value {
  270. return nil
  271. }
  272. _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, patchDataBytes, metav1.PatchOptions{})
  273. return err
  274. })
  275. }
  276. // transitJobState transit job to next state
  277. func (c *Controller) transitJobState(job *sednav1.LifelongLearningJob) (bool, error) {
  278. var initialType sednav1.LLJobStageConditionType
  279. var latestCondition sednav1.LLJobCondition = sednav1.LLJobCondition{
  280. Stage: sednav1.LLJobTrain,
  281. Type: initialType,
  282. }
  283. var newConditionType sednav1.LLJobStageConditionType
  284. var needUpdated = false
  285. var podStatus v1.PodPhase = v1.PodUnknown
  286. var pod *v1.Pod
  287. jobConditions := job.Status.Conditions
  288. if len(jobConditions) > 0 {
  289. // get latest pod and pod status
  290. latestCondition = (jobConditions)[len(jobConditions)-1]
  291. klog.V(2).Infof("lifelonglearning job %v/%v latest stage %v:", job.Namespace, job.Name,
  292. latestCondition.Stage)
  293. pod = c.getSpecifiedPods(job, string(latestCondition.Stage))
  294. if pod != nil {
  295. podStatus = pod.Status.Phase
  296. }
  297. }
  298. jobStage := latestCondition.Stage
  299. currentType := latestCondition.Type
  300. newConditionType = currentType
  301. switch currentType {
  302. case initialType:
  303. newConditionType = sednav1.LLJobStageCondWaiting
  304. case sednav1.LLJobStageCondWaiting:
  305. // do nothing, waiting for LC to set type from waiting to ready
  306. case sednav1.LLJobStageCondReady:
  307. // create a pod, and set type from ready to starting
  308. // include train, eval, deploy pod
  309. var err error
  310. if jobStage == sednav1.LLJobDeploy {
  311. err = c.restartInferPod(job)
  312. if err != nil {
  313. klog.V(2).Infof("lifelonglearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err)
  314. return needUpdated, err
  315. }
  316. klog.V(2).Infof("lifelonglearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name)
  317. newConditionType = sednav1.LLJobStageCondCompleted
  318. } else {
  319. if podStatus != v1.PodPending && podStatus != v1.PodRunning {
  320. err = c.createPod(job, jobStage)
  321. if err != nil {
  322. return needUpdated, err
  323. }
  324. }
  325. newConditionType = sednav1.LLJobStageCondStarting
  326. }
  327. case sednav1.LLJobStageCondStarting, sednav1.LLJobStageCondRunning:
  328. if podStatus == v1.PodRunning {
  329. // add nodeName to job
  330. if err := c.setWorkerNodeNameOfJob(job, string(jobStage), pod.Spec.NodeName); err != nil {
  331. return needUpdated, err
  332. }
  333. // watch pod status, if pod running, set type running
  334. newConditionType = sednav1.LLJobStageCondRunning
  335. } else if podStatus == v1.PodSucceeded {
  336. // watch pod status, if pod completed, set type completed
  337. newConditionType = sednav1.LLJobStageCondCompleted
  338. klog.V(2).Infof("lifelonglearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage)
  339. } else if podStatus == v1.PodFailed {
  340. newConditionType = sednav1.LLJobStageCondFailed
  341. klog.V(2).Infof("lifelonglearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage)
  342. }
  343. case sednav1.LLJobStageCondCompleted:
  344. jobStage = c.getNextStage(jobStage)
  345. newConditionType = sednav1.LLJobStageCondWaiting
  346. case sednav1.LLJobStageCondFailed:
  347. jobStage = sednav1.LLJobTrain
  348. newConditionType = sednav1.LLJobStageCondWaiting
  349. default:
  350. // do nothing when given other type out of cases
  351. }
  352. klog.V(2).Infof("lifelonglearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions)
  353. if latestCondition.Type != newConditionType {
  354. job.Status.Conditions = append(job.Status.Conditions, NewJobCondition(newConditionType, jobStage))
  355. needUpdated = true
  356. return needUpdated, nil
  357. }
  358. return needUpdated, nil
  359. }
  360. // updateJobStatus ensures that jobstatus can be updated rightly
  361. func (c *Controller) updateJobStatus(job *sednav1.LifelongLearningJob) error {
  362. jobClient := c.client.LifelongLearningJobs(job.Namespace)
  363. return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
  364. newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
  365. if err != nil {
  366. return err
  367. }
  368. newJob.Status = job.Status
  369. _, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
  370. return err
  371. })
  372. }
  373. func NewJobCondition(conditionType sednav1.LLJobStageConditionType, jobStage sednav1.LLJobStage) sednav1.LLJobCondition {
  374. return sednav1.LLJobCondition{
  375. Type: conditionType,
  376. Status: v1.ConditionTrue,
  377. LastHeartbeatTime: metav1.Now(),
  378. LastTransitionTime: metav1.Now(),
  379. Reason: "",
  380. Message: "",
  381. Stage: jobStage,
  382. }
  383. }
  384. func (c *Controller) generatePodName(jobName string, workerType string) string {
  385. return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5)
  386. }
  387. func (c *Controller) getSpecifiedPods(job *sednav1.LifelongLearningJob, podType string) *v1.Pod {
  388. if podType == "Deploy" {
  389. podType = runtime.InferencePodType
  390. }
  391. var latestPod *v1.Pod
  392. selector, _ := runtime.GenerateSelector(job)
  393. pods, err := c.podStore.Pods(job.Namespace).List(selector)
  394. if len(pods) == 0 || err != nil {
  395. return nil
  396. }
  397. var matchTag = false
  398. latestPod = pods[0]
  399. for _, pod := range pods {
  400. s := strings.Split(pod.Name, "-")
  401. CurrentPodType := s[len(s)-2]
  402. if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && CurrentPodType == strings.ToLower(podType) {
  403. latestPod = pod
  404. matchTag = true
  405. }
  406. }
  407. if !matchTag {
  408. return nil
  409. }
  410. return latestPod
  411. }
  412. func (c *Controller) restartInferPod(job *sednav1.LifelongLearningJob) error {
  413. inferPod := c.getSpecifiedPods(job, runtime.InferencePodType)
  414. if inferPod == nil {
  415. klog.V(2).Infof("No inferpod is running in lifelonglearning job %v/%v", job.Namespace, job.Name)
  416. err := c.createInferPod(job)
  417. return err
  418. }
  419. ctx := context.Background()
  420. err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{})
  421. if err != nil {
  422. klog.Warningf("failed to delete inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
  423. return err
  424. }
  425. err = c.createInferPod(job)
  426. if err != nil {
  427. klog.Warningf("failed to create inference pod %s for lifelonglearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
  428. return err
  429. }
  430. return nil
  431. }
  432. func (c *Controller) getNextStage(currentStage sednav1.LLJobStage) sednav1.LLJobStage {
  433. switch currentStage {
  434. case sednav1.LLJobTrain:
  435. return sednav1.LLJobEval
  436. case sednav1.LLJobEval:
  437. return sednav1.LLJobDeploy
  438. case sednav1.LLJobDeploy:
  439. return sednav1.LLJobTrain
  440. default:
  441. return sednav1.LLJobTrain
  442. }
  443. }
  444. func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) {
  445. if name != "" {
  446. secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
  447. if err != nil {
  448. err = fmt.Errorf("failed to get the secret %s for %s: %w",
  449. name,
  450. ownerStr, err)
  451. }
  452. }
  453. return
  454. }
  455. func IsJobFinished(j *sednav1.LifelongLearningJob) bool {
  456. // TODO
  457. return false
  458. }
  459. func (c *Controller) createPod(job *sednav1.LifelongLearningJob, podtype sednav1.LLJobStage) (err error) {
  460. ctx := context.Background()
  461. var podTemplate *v1.PodTemplateSpec
  462. LLDatasetName := job.Spec.Dataset.Name
  463. dataset, err := c.client.Datasets(job.Namespace).Get(ctx, LLDatasetName, metav1.GetOptions{})
  464. if err != nil {
  465. return fmt.Errorf("failed to get dataset %s: %w", LLDatasetName, err)
  466. }
  467. datasetSecret, err := c.getSecret(
  468. job.Namespace,
  469. dataset.Spec.CredentialName,
  470. fmt.Sprintf("dataset %s", dataset.Name),
  471. )
  472. if err != nil {
  473. return err
  474. }
  475. jobSecret, err := c.getSecret(
  476. job.Namespace,
  477. job.Spec.CredentialName,
  478. fmt.Sprintf("lifelonglearning job %s", job.Name),
  479. )
  480. if err != nil {
  481. return err
  482. }
  483. // get all url for train and eval from data in condition
  484. condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data
  485. klog.V(2).Infof("lifelonglearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr)
  486. var cond ConditionData
  487. (&cond).Unmarshal([]byte(condDataStr))
  488. if cond.Input == nil {
  489. return fmt.Errorf("empty input from condData")
  490. }
  491. dataURL := cond.Input.DataURL
  492. inputmodelURLs := cond.GetInputModelURLs()
  493. var originalDataURLOrIndex string
  494. if cond.Input.DataIndexURL != "" {
  495. // this guarantee dataset.Spec.URL is not in host filesystem by LC,
  496. // but cond.Input.DataIndexURL could be in host filesystem.
  497. originalDataURLOrIndex = cond.Input.DataIndexURL
  498. } else {
  499. originalDataURLOrIndex = dataset.Spec.URL
  500. }
  501. var workerParam *runtime.WorkerParam = new(runtime.WorkerParam)
  502. if podtype == sednav1.LLJobTrain {
  503. workerParam.WorkerType = "Train"
  504. podTemplate = &job.Spec.TrainSpec.Template
  505. // Env parameters for train
  506. workerParam.Env = map[string]string{
  507. "NAMESPACE": job.Namespace,
  508. "JOB_NAME": job.Name,
  509. "WORKER_NAME": "train-worker-" + utilrand.String(5),
  510. "LC_SERVER": c.cfg.LC.Server,
  511. "KB_SERVER": c.cfg.KB.Server,
  512. }
  513. workerParam.Mounts = append(workerParam.Mounts,
  514. runtime.WorkerMount{
  515. URL: &runtime.MountURL{
  516. URL: cond.Input.OutputDir,
  517. Secret: jobSecret,
  518. DownloadByInitializer: false,
  519. },
  520. EnvName: "OUTPUT_URL",
  521. },
  522. runtime.WorkerMount{
  523. URL: &runtime.MountURL{
  524. URL: dataURL,
  525. Secret: jobSecret,
  526. DownloadByInitializer: true,
  527. },
  528. EnvName: "TRAIN_DATASET_URL",
  529. },
  530. // see https://github.com/kubeedge/sedna/issues/35
  531. runtime.WorkerMount{
  532. URL: &runtime.MountURL{
  533. Secret: datasetSecret,
  534. URL: originalDataURLOrIndex,
  535. Indirect: dataset.Spec.URL != originalDataURLOrIndex,
  536. DownloadByInitializer: true,
  537. },
  538. EnvName: "ORIGINAL_DATASET_URL",
  539. },
  540. )
  541. } else {
  542. podTemplate = &job.Spec.EvalSpec.Template
  543. workerParam.WorkerType = "Eval"
  544. // Configure Env information for eval by initial WorkerParam
  545. workerParam.Env = map[string]string{
  546. "NAMESPACE": job.Namespace,
  547. "JOB_NAME": job.Name,
  548. "WORKER_NAME": "eval-worker-" + utilrand.String(5),
  549. "LC_SERVER": c.cfg.LC.Server,
  550. "KB_SERVER": c.cfg.KB.Server,
  551. }
  552. var modelMountURLs []runtime.MountURL
  553. for _, url := range inputmodelURLs {
  554. modelMountURLs = append(modelMountURLs, runtime.MountURL{
  555. URL: url,
  556. Secret: jobSecret,
  557. DownloadByInitializer: true,
  558. })
  559. }
  560. workerParam.Mounts = append(workerParam.Mounts,
  561. runtime.WorkerMount{
  562. URLs: modelMountURLs,
  563. Name: "models",
  564. EnvName: "MODEL_URLS",
  565. },
  566. runtime.WorkerMount{
  567. URL: &runtime.MountURL{
  568. URL: cond.Input.OutputDir,
  569. Secret: jobSecret,
  570. DownloadByInitializer: false,
  571. },
  572. EnvName: "OUTPUT_URL",
  573. },
  574. runtime.WorkerMount{
  575. URL: &runtime.MountURL{
  576. URL: dataURL,
  577. Secret: datasetSecret,
  578. DownloadByInitializer: true,
  579. },
  580. Name: "datasets",
  581. EnvName: "TEST_DATASET_URL",
  582. },
  583. runtime.WorkerMount{
  584. URL: &runtime.MountURL{
  585. Secret: datasetSecret,
  586. URL: originalDataURLOrIndex,
  587. DownloadByInitializer: true,
  588. Indirect: dataset.Spec.URL != originalDataURLOrIndex,
  589. },
  590. Name: "origin-dataset",
  591. EnvName: "ORIGINAL_DATASET_URL",
  592. },
  593. )
  594. }
  595. // set the default policy instead of Always policy
  596. workerParam.RestartPolicy = v1.RestartPolicyOnFailure
  597. workerParam.HostNetwork = true
  598. // create pod based on podtype
  599. _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, workerParam)
  600. if err != nil {
  601. return err
  602. }
  603. return
  604. }
  605. func (c *Controller) createInferPod(job *sednav1.LifelongLearningJob) error {
  606. inferModelURL := strings.Join([]string{strings.TrimRight(job.Spec.OutputDir, "/"), "deploy/index.pkl"}, "/")
  607. jobSecret, err := c.getSecret(
  608. job.Namespace,
  609. job.Spec.CredentialName,
  610. fmt.Sprintf("lifelonglearning job %s", job.Name),
  611. )
  612. if err != nil {
  613. return err
  614. }
  615. var workerParam *runtime.WorkerParam = new(runtime.WorkerParam)
  616. workerParam.Mounts = append(workerParam.Mounts,
  617. runtime.WorkerMount{
  618. URL: &runtime.MountURL{
  619. URL: inferModelURL,
  620. Secret: jobSecret,
  621. DownloadByInitializer: false,
  622. },
  623. Name: "models",
  624. EnvName: "MODEL_URLS",
  625. },
  626. )
  627. workerParam.Env = map[string]string{
  628. "NAMESPACE": job.Namespace,
  629. "JOB_NAME": job.Name,
  630. "WORKER_NAME": "inferworker-" + utilrand.String(5),
  631. "LC_SERVER": c.cfg.LC.Server,
  632. }
  633. workerParam.WorkerType = runtime.InferencePodType
  634. workerParam.HostNetwork = true
  635. // create edge pod
  636. _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam)
  637. return err
  638. }
  639. // New creates a new LifelongLearningJob controller that keeps the relevant pods
  640. // in sync with their corresponding LifelongLearningJob objects.
  641. func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
  642. cfg := cc.Config
  643. podInformer := cc.KubeInformerFactory.Core().V1().Pods()
  644. jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().LifelongLearningJobs()
  645. eventBroadcaster := record.NewBroadcaster()
  646. eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})
  647. jc := &Controller{
  648. kubeClient: cc.KubeClient,
  649. client: cc.SednaClient.SednaV1alpha1(),
  650. queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name),
  651. cfg: cfg,
  652. }
  653. jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  654. AddFunc: func(obj interface{}) {
  655. jc.enqueueController(obj, true)
  656. jc.syncToEdge(watch.Added, obj)
  657. },
  658. UpdateFunc: func(old, cur interface{}) {
  659. jc.enqueueController(cur, true)
  660. jc.syncToEdge(watch.Added, cur)
  661. },
  662. DeleteFunc: func(obj interface{}) {
  663. jc.enqueueController(obj, true)
  664. jc.syncToEdge(watch.Deleted, obj)
  665. },
  666. })
  667. jc.jobLister = jobInformer.Lister()
  668. jc.jobStoreSynced = jobInformer.Informer().HasSynced
  669. podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  670. AddFunc: jc.addPod,
  671. UpdateFunc: jc.updatePod,
  672. DeleteFunc: jc.deletePod,
  673. })
  674. jc.podStore = podInformer.Lister()
  675. jc.podStoreSynced = podInformer.Informer().HasSynced
  676. return jc, nil
  677. }