You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

incrementallearningjob.go 26 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862
  1. /*
  2. Copyright 2021 The KubeEdge Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package incrementallearning
  14. import (
  15. "context"
  16. "encoding/json"
  17. "fmt"
  18. "strings"
  19. "time"
  20. v1 "k8s.io/api/core/v1"
  21. "k8s.io/apimachinery/pkg/api/errors"
  22. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  23. "k8s.io/apimachinery/pkg/types"
  24. utilrand "k8s.io/apimachinery/pkg/util/rand"
  25. utilruntime "k8s.io/apimachinery/pkg/util/runtime"
  26. "k8s.io/apimachinery/pkg/util/wait"
  27. "k8s.io/apimachinery/pkg/watch"
  28. "k8s.io/client-go/kubernetes"
  29. v1core "k8s.io/client-go/kubernetes/typed/core/v1"
  30. corelisters "k8s.io/client-go/listers/core/v1"
  31. "k8s.io/client-go/tools/cache"
  32. "k8s.io/client-go/tools/record"
  33. "k8s.io/client-go/util/workqueue"
  34. "k8s.io/klog/v2"
  35. k8scontroller "k8s.io/kubernetes/pkg/controller"
  36. sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
  37. sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
  38. sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
  39. "github.com/kubeedge/sedna/pkg/globalmanager/config"
  40. "github.com/kubeedge/sedna/pkg/globalmanager/runtime"
  41. )
  42. const (
  43. // Name is this controller name
  44. Name = "IncrementalLearning"
  45. // KindName is the kind name of CR this controller controls
  46. KindName = "IncrementalLearningJob"
  47. )
  48. // Kind contains the schema.GroupVersionKind for this controller type.
  49. var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)
  50. // Controller ensures that all IncrementalLearningJob objects have corresponding pods to
  51. // run their configured workload.
  52. type Controller struct {
  53. kubeClient kubernetes.Interface
  54. client sednaclientset.SednaV1alpha1Interface
  55. // podStoreSynced returns true if the pod store has been synced at least once.
  56. // Added as a member to the struct to allow injection for testing.
  57. podStoreSynced cache.InformerSynced
  58. // jobStoreSynced returns true if the job store has been synced at least once.
  59. // Added as a member to the struct to allow injection for testing.
  60. jobStoreSynced cache.InformerSynced
  61. // A store of jobs
  62. jobLister sednav1listers.IncrementalLearningJobLister
  63. // A store of pods, populated by the podController
  64. podStore corelisters.PodLister
  65. // IncrementalLearningJobs that need to be updated
  66. queue workqueue.RateLimitingInterface
  67. cfg *config.ControllerConfig
  68. sendToEdgeFunc runtime.DownstreamSendFunc
  69. }
  70. // Run starts the main goroutine responsible for watching and syncing jobs.
  71. func (c *Controller) Run(stopCh <-chan struct{}) {
  72. // TODO: make workers parameter
  73. workers := 1
  74. defer utilruntime.HandleCrash()
  75. defer c.queue.ShutDown()
  76. klog.Infof("Starting %s controller", Name)
  77. defer klog.Infof("Shutting down %s controller", Name)
  78. if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) {
  79. klog.Errorf("failed to wait for %s caches to sync", Name)
  80. return
  81. }
  82. klog.Infof("Starting %s job workers", Name)
  83. for i := 0; i < workers; i++ {
  84. go wait.Until(c.worker, time.Second, stopCh)
  85. }
  86. <-stopCh
  87. }
  88. // enqueueByPod enqueues the jointInferenceService object of the specified pod.
  89. func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
  90. controllerRef := metav1.GetControllerOf(pod)
  91. if controllerRef == nil {
  92. return
  93. }
  94. if controllerRef.Kind != Kind.Kind {
  95. return
  96. }
  97. service, err := c.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name)
  98. if err != nil {
  99. return
  100. }
  101. if service.UID != controllerRef.UID {
  102. return
  103. }
  104. c.enqueueController(service, immediate)
  105. }
  106. // When a pod is created, enqueue the controller that manages it and update it's expectations.
  107. func (c *Controller) addPod(obj interface{}) {
  108. pod := obj.(*v1.Pod)
  109. if pod.DeletionTimestamp != nil {
  110. // on a restart of the controller, it's possible a new pod shows up in a state that
  111. // is already pending deletion. Prevent the pod from being a creation observation.
  112. c.deletePod(pod)
  113. return
  114. }
  115. // backoff to queue when PodFailed
  116. immediate := pod.Status.Phase != v1.PodFailed
  117. c.enqueueByPod(pod, immediate)
  118. }
  119. // When a pod is updated, figure out what joint inference service manage it and wake them up.
  120. func (c *Controller) updatePod(old, cur interface{}) {
  121. curPod := cur.(*v1.Pod)
  122. oldPod := old.(*v1.Pod)
  123. // no pod update, no queue
  124. if curPod.ResourceVersion == oldPod.ResourceVersion {
  125. return
  126. }
  127. c.addPod(curPod)
  128. }
  129. // deletePod enqueues the jointinferenceservice obj When a pod is deleted
  130. func (c *Controller) deletePod(obj interface{}) {
  131. pod, ok := obj.(*v1.Pod)
  132. // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
  133. // When a delete is dropped, the relist will notice a pod in the store not
  134. // in the list, leading to the insertion of a tombstone object which contains
  135. // the deleted key/value. Note that this value might be stale. If the pod
  136. // changed labels the new jointinferenceservice will not be woken up till the periodic resync.
  137. if !ok {
  138. tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
  139. if !ok {
  140. klog.Warningf("couldn't get object from tombstone %+v", obj)
  141. return
  142. }
  143. pod, ok = tombstone.Obj.(*v1.Pod)
  144. if !ok {
  145. klog.Warningf("tombstone contained object that is not a pod %+v", obj)
  146. return
  147. }
  148. }
  149. c.enqueueByPod(pod, true)
  150. }
  151. // obj could be an *sedna.IncrementalLearningJob, or a DeletionFinalStateUnknown marker item,
  152. // immediate tells the controller to update the status right away, and should
  153. // happen ONLY when there was a successful pod run.
  154. func (c *Controller) enqueueController(obj interface{}, immediate bool) {
  155. key, err := k8scontroller.KeyFunc(obj)
  156. if err != nil {
  157. utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
  158. return
  159. }
  160. backoff := time.Duration(0)
  161. if !immediate {
  162. backoff = runtime.GetBackoff(c.queue, key)
  163. }
  164. c.queue.AddAfter(key, backoff)
  165. }
  166. // worker runs a worker thread that just dequeues items, processes them, and marks them done.
  167. // It enforces that the syncHandler is never invoked concurrently with the same key.
  168. func (c *Controller) worker() {
  169. for c.processNextWorkItem() {
  170. }
  171. }
  172. func (c *Controller) processNextWorkItem() bool {
  173. key, quit := c.queue.Get()
  174. if quit {
  175. return false
  176. }
  177. defer c.queue.Done(key)
  178. forget, err := c.sync(key.(string))
  179. if err == nil {
  180. if forget {
  181. c.queue.Forget(key)
  182. }
  183. return true
  184. }
  185. utilruntime.HandleError(fmt.Errorf("Error syncing incrementallearning job: %v", err))
  186. c.queue.AddRateLimited(key)
  187. return true
  188. }
  189. // sync will sync the incrementallearning job with the given key if it has had its expectations fulfilled, meaning
  190. // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
  191. // concurrently with the same key.
  192. func (c *Controller) sync(key string) (bool, error) {
  193. startTime := time.Now()
  194. defer func() {
  195. klog.V(4).Infof("Finished syncing incrementallearning job %q (%v)", key, time.Since(startTime))
  196. }()
  197. ns, name, err := cache.SplitMetaNamespaceKey(key)
  198. if err != nil {
  199. return false, err
  200. }
  201. if len(ns) == 0 || len(name) == 0 {
  202. return false, fmt.Errorf("invalid incrementallearning job key %q: either namespace or name is missing", key)
  203. }
  204. sharedJob, err := c.jobLister.IncrementalLearningJobs(ns).Get(name)
  205. if err != nil {
  206. if errors.IsNotFound(err) {
  207. klog.V(4).Infof("incrementallearning job has been deleted: %v", key)
  208. return true, nil
  209. }
  210. return false, err
  211. }
  212. job := *sharedJob
  213. // set kind in case that the kind is None
  214. job.SetGroupVersionKind(Kind)
  215. // when job is handled at first, create pod for inference
  216. if job.Status.StartTime == nil {
  217. now := metav1.Now()
  218. job.Status.StartTime = &now
  219. pod := c.getSpecifiedPods(&job, runtime.InferencePodType)
  220. if pod == nil {
  221. err = c.createInferPod(&job)
  222. } else {
  223. if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodPending {
  224. err = c.createInferPod(&job)
  225. }
  226. }
  227. if err != nil {
  228. return false, nil
  229. }
  230. }
  231. // if job was finished previously, we don't want to redo the termination
  232. if IsJobFinished(&job) {
  233. return true, nil
  234. }
  235. forget := false
  236. jobFailed := false
  237. needUpdated := false
  238. // transit this job's state machine
  239. needUpdated, err = c.transitJobState(&job)
  240. if err != nil {
  241. klog.V(2).Infof("incrementallearning job %v/%v failed to be updated, err:%s", job.Namespace, job.Name, err)
  242. }
  243. if needUpdated {
  244. if err := c.updateJobStatus(&job); err != nil {
  245. return forget, err
  246. }
  247. if jobFailed && !IsJobFinished(&job) {
  248. // returning an error will re-enqueue IncrementalLearningJob after the backoff period
  249. return forget, fmt.Errorf("failed pod(s) detected for incrementallearning job key %q", key)
  250. }
  251. forget = true
  252. }
  253. return forget, err
  254. }
  255. // setWorkerNodeNameOfJob sets the worker nodeName of the specified job
  256. // which is used for downstream to sync job info to the specified LC located in nodeName.
  257. func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error {
  258. key := runtime.AnnotationsKeyPrefix + jobStage
  259. ann := job.GetAnnotations()
  260. if ann[key] == nodeName {
  261. // already set
  262. return nil
  263. }
  264. dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName)
  265. jobClient := c.client.IncrementalLearningJobs(job.Namespace)
  266. return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
  267. newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
  268. if err != nil {
  269. return err
  270. }
  271. annotations := newJob.GetAnnotations()
  272. if annotations[key] == nodeName {
  273. return nil
  274. }
  275. _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{})
  276. return err
  277. })
  278. }
  279. // transitJobState transit job to next state
  280. func (c *Controller) transitJobState(job *sednav1.IncrementalLearningJob) (bool, error) {
  281. var initialType sednav1.ILJobStageConditionType
  282. var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{
  283. Stage: sednav1.ILJobTrain,
  284. Type: initialType,
  285. }
  286. var newConditionType sednav1.ILJobStageConditionType
  287. var needUpdated = false
  288. var podStatus v1.PodPhase = v1.PodUnknown
  289. var pod *v1.Pod
  290. jobConditions := job.Status.Conditions
  291. if len(jobConditions) > 0 {
  292. // get latest pod and pod status
  293. latestCondition = (jobConditions)[len(jobConditions)-1]
  294. klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", job.Namespace, job.Name,
  295. latestCondition.Stage)
  296. pod = c.getSpecifiedPods(job, string(latestCondition.Stage))
  297. if pod != nil {
  298. podStatus = pod.Status.Phase
  299. }
  300. }
  301. jobStage := latestCondition.Stage
  302. currentType := latestCondition.Type
  303. newConditionType = currentType
  304. switch currentType {
  305. case initialType:
  306. newConditionType = sednav1.ILJobStageCondWaiting
  307. case sednav1.ILJobStageCondWaiting:
  308. // do nothing, waiting for LC to set type from waiting to ready
  309. case sednav1.ILJobStageCondReady:
  310. // create a pod, and set type from ready to starting
  311. // include train, eval, deploy pod
  312. var err error
  313. if jobStage == sednav1.ILJobDeploy {
  314. err = c.restartInferPod(job)
  315. if err != nil {
  316. klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", job.Namespace, job.Name, err)
  317. } else {
  318. klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", job.Namespace, job.Name)
  319. }
  320. } else if podStatus != v1.PodPending && podStatus != v1.PodRunning {
  321. err = c.createPod(job, jobStage)
  322. }
  323. if err != nil {
  324. return needUpdated, err
  325. }
  326. newConditionType = sednav1.ILJobStageCondStarting
  327. case sednav1.ILJobStageCondStarting, sednav1.ILJobStageCondRunning:
  328. if podStatus == v1.PodRunning {
  329. if jobStage == sednav1.ILJobDeploy {
  330. newConditionType = sednav1.ILJobStageCondCompleted
  331. } else {
  332. // watch pod status, if pod running, set type running
  333. newConditionType = sednav1.ILJobStageCondRunning
  334. // add nodeName to job
  335. if err := c.setWorkerNodeNameOfJob(job, string(jobStage), pod.Spec.NodeName); err != nil {
  336. return needUpdated, err
  337. }
  338. }
  339. } else if podStatus == v1.PodSucceeded {
  340. // watch pod status, if pod completed, set type completed
  341. newConditionType = sednav1.ILJobStageCondCompleted
  342. klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", job.Namespace, job.Name, jobStage)
  343. } else if podStatus == v1.PodFailed {
  344. newConditionType = sednav1.ILJobStageCondFailed
  345. klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", job.Namespace, job.Name, jobStage)
  346. }
  347. case sednav1.ILJobStageCondCompleted:
  348. jobStage = getNextStage(jobStage)
  349. newConditionType = sednav1.ILJobStageCondWaiting
  350. case sednav1.ILJobStageCondFailed:
  351. jobStage = sednav1.ILJobTrain
  352. newConditionType = sednav1.ILJobStageCondWaiting
  353. default:
  354. // do nothing when given other type out of cases
  355. }
  356. klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", job.Namespace, job.Name, jobConditions)
  357. if latestCondition.Type != newConditionType {
  358. job.Status.Conditions = append(job.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage))
  359. needUpdated = true
  360. }
  361. return needUpdated, nil
  362. }
  363. // updateJobStatus ensures that job status can be updated rightly
  364. func (c *Controller) updateJobStatus(job *sednav1.IncrementalLearningJob) error {
  365. jobClient := c.client.IncrementalLearningJobs(job.Namespace)
  366. return runtime.RetryUpdateStatus(job.Name, job.Namespace, func() error {
  367. newJob, err := jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
  368. if err != nil {
  369. return err
  370. }
  371. newJob.Status = job.Status
  372. _, err = jobClient.UpdateStatus(context.TODO(), newJob, metav1.UpdateOptions{})
  373. return err
  374. })
  375. }
  376. func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, jobStage sednav1.ILJobStage) sednav1.ILJobCondition {
  377. return sednav1.ILJobCondition{
  378. Type: conditionType,
  379. Status: v1.ConditionTrue,
  380. LastHeartbeatTime: metav1.Now(),
  381. LastTransitionTime: metav1.Now(),
  382. Reason: "",
  383. Message: "",
  384. Stage: jobStage,
  385. }
  386. }
  387. func (c *Controller) generatePodName(jobName string, workerType string) string {
  388. return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5)
  389. }
  390. func (c *Controller) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod {
  391. var latestPod *v1.Pod
  392. selector, _ := runtime.GenerateSelector(job)
  393. pods, err := c.podStore.Pods(job.Namespace).List(selector)
  394. if len(pods) == 0 || err != nil {
  395. return nil
  396. }
  397. var matchTag = false
  398. latestPod = pods[0]
  399. if podType == "Deploy" {
  400. podType = runtime.InferencePodType
  401. }
  402. for _, pod := range pods {
  403. s := strings.Split(pod.Name, "-")
  404. currentPodType := s[len(s)-2]
  405. if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && currentPodType == strings.ToLower(podType) {
  406. latestPod = pod
  407. matchTag = true
  408. }
  409. }
  410. if !matchTag {
  411. return nil
  412. }
  413. return latestPod
  414. }
  415. func (c *Controller) restartInferPod(job *sednav1.IncrementalLearningJob) error {
  416. inferPod := c.getSpecifiedPods(job, runtime.InferencePodType)
  417. if inferPod == nil {
  418. klog.V(2).Infof("No inferpod is running in incrementallearning job %v/%v", job.Namespace, job.Name)
  419. err := c.createInferPod(job)
  420. return err
  421. }
  422. ctx := context.Background()
  423. err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{})
  424. if err != nil {
  425. klog.Warningf("failed to delete inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
  426. return err
  427. }
  428. err = c.createInferPod(job)
  429. if err != nil {
  430. klog.Warningf("failed to create inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
  431. return err
  432. }
  433. return nil
  434. }
  435. func getNextStage(currentStage sednav1.ILJobStage) sednav1.ILJobStage {
  436. switch currentStage {
  437. case sednav1.ILJobTrain:
  438. return sednav1.ILJobEval
  439. case sednav1.ILJobEval:
  440. return sednav1.ILJobDeploy
  441. case sednav1.ILJobDeploy:
  442. return sednav1.ILJobTrain
  443. default:
  444. return sednav1.ILJobTrain
  445. }
  446. }
  447. func IsJobFinished(j *sednav1.IncrementalLearningJob) bool {
  448. // TODO
  449. return false
  450. }
  451. func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) {
  452. if name != "" {
  453. secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
  454. if err != nil {
  455. err = fmt.Errorf("failed to get the secret %s for %s: %w",
  456. name,
  457. ownerStr, err)
  458. }
  459. }
  460. return
  461. }
  462. func (c *Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) {
  463. ctx := context.Background()
  464. var podTemplate *v1.PodTemplateSpec
  465. incrementalDatasetName := job.Spec.Dataset.Name
  466. initialModelName := job.Spec.InitialModel.Name
  467. deployModelName := job.Spec.DeploySpec.Model.Name
  468. // check initial model name
  469. initialModel, err := c.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{})
  470. if err != nil {
  471. return fmt.Errorf("failed to get initial model %s: %w",
  472. initialModelName, err)
  473. }
  474. _, err = c.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{})
  475. if err != nil {
  476. return fmt.Errorf("failed to get deploy model %s: %w",
  477. deployModelName, err)
  478. }
  479. dataset, err := c.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{})
  480. if err != nil {
  481. return fmt.Errorf("failed to get dataset %s: %w",
  482. incrementalDatasetName, err)
  483. }
  484. datasetSecret, err := c.getSecret(
  485. job.Namespace,
  486. dataset.Spec.CredentialName,
  487. fmt.Sprintf("dataset %s", dataset.Name),
  488. )
  489. if err != nil {
  490. return err
  491. }
  492. jobSecret, err := c.getSecret(
  493. job.Namespace,
  494. job.Spec.CredentialName,
  495. fmt.Sprintf("incremental job %s", job.Name),
  496. )
  497. if err != nil {
  498. return err
  499. }
  500. // get all url for train and eval from data in condition
  501. var cond IncrementalCondData
  502. condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data
  503. klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr)
  504. (&cond).Unmarshal([]byte(condDataStr))
  505. if cond.Input == nil {
  506. return fmt.Errorf("empty input from condData")
  507. }
  508. dataURL := cond.Input.DataURL
  509. inputmodelURLs := cond.GetInputModelURLs()
  510. var originalDataURLOrIndex string
  511. if cond.Input.DataIndexURL != "" {
  512. // this guarantee dataset.Spec.URL is not in host filesystem by LC,
  513. // but cond.Input.DataIndexURL could be in host filesystem.
  514. originalDataURLOrIndex = cond.Input.DataIndexURL
  515. } else {
  516. originalDataURLOrIndex = dataset.Spec.URL
  517. }
  518. var workerParam runtime.WorkerParam
  519. if podtype == sednav1.ILJobTrain {
  520. workerParam.WorkerType = runtime.TrainPodType
  521. podTemplate = &job.Spec.TrainSpec.Template
  522. // Env parameters for train
  523. workerParam.Env = map[string]string{
  524. "NAMESPACE": job.Namespace,
  525. "JOB_NAME": job.Name,
  526. "WORKER_NAME": "train-worker-" + utilrand.String(5),
  527. "LC_SERVER": c.cfg.LC.Server,
  528. }
  529. baseModelURL := inputmodelURLs[0]
  530. var baseModelSecret *v1.Secret
  531. if baseModelURL == initialModel.Spec.URL {
  532. baseModelSecret, err = c.getSecret(
  533. job.Namespace,
  534. initialModel.Spec.CredentialName,
  535. fmt.Sprintf("initial model %s", initialModelName),
  536. )
  537. if err != nil {
  538. return err
  539. }
  540. } else {
  541. baseModelSecret = jobSecret
  542. }
  543. workerParam.Mounts = append(workerParam.Mounts,
  544. runtime.WorkerMount{
  545. URL: &runtime.MountURL{
  546. URL: baseModelURL,
  547. Secret: baseModelSecret,
  548. DownloadByInitializer: true,
  549. },
  550. EnvName: "BASE_MODEL_URL",
  551. },
  552. runtime.WorkerMount{
  553. URL: &runtime.MountURL{
  554. URL: cond.Input.OutputDir,
  555. Secret: jobSecret,
  556. DownloadByInitializer: false,
  557. },
  558. EnvName: "MODEL_URL",
  559. },
  560. runtime.WorkerMount{
  561. URL: &runtime.MountURL{
  562. URL: dataURL,
  563. DownloadByInitializer: true,
  564. Secret: jobSecret,
  565. },
  566. EnvName: "TRAIN_DATASET_URL",
  567. },
  568. // see https://github.com/kubeedge/sedna/issues/35
  569. runtime.WorkerMount{
  570. URL: &runtime.MountURL{
  571. Secret: datasetSecret,
  572. URL: originalDataURLOrIndex,
  573. DownloadByInitializer: true,
  574. Indirect: dataset.Spec.URL != originalDataURLOrIndex,
  575. },
  576. EnvName: "ORIGINAL_DATASET_URL",
  577. },
  578. )
  579. } else {
  580. // Configure eval worker's mounts and envs
  581. podTemplate = &job.Spec.EvalSpec.Template
  582. workerParam.WorkerType = "Eval"
  583. workerParam.Env = map[string]string{
  584. "NAMESPACE": job.Namespace,
  585. "JOB_NAME": job.Name,
  586. "WORKER_NAME": "eval-worker-" + utilrand.String(5),
  587. "LC_SERVER": c.cfg.LC.Server,
  588. }
  589. var modelMountURLs []runtime.MountURL
  590. for _, url := range inputmodelURLs {
  591. var modelSecret *v1.Secret
  592. if url == initialModel.Spec.URL {
  593. modelSecret, err = c.getSecret(
  594. job.Namespace,
  595. initialModel.Spec.CredentialName,
  596. fmt.Sprintf("initial model %s", initialModelName),
  597. )
  598. if err != nil {
  599. return err
  600. }
  601. } else {
  602. modelSecret = jobSecret
  603. }
  604. modelMountURLs = append(modelMountURLs, runtime.MountURL{
  605. URL: url,
  606. Secret: modelSecret,
  607. DownloadByInitializer: true,
  608. })
  609. }
  610. workerParam.Mounts = append(workerParam.Mounts,
  611. runtime.WorkerMount{
  612. URLs: modelMountURLs,
  613. Name: "models",
  614. EnvName: "MODEL_URLS",
  615. },
  616. runtime.WorkerMount{
  617. URL: &runtime.MountURL{
  618. URL: dataURL,
  619. Secret: datasetSecret,
  620. DownloadByInitializer: true,
  621. },
  622. Name: "datasets",
  623. EnvName: "TEST_DATASET_URL",
  624. },
  625. runtime.WorkerMount{
  626. URL: &runtime.MountURL{
  627. Secret: datasetSecret,
  628. URL: originalDataURLOrIndex,
  629. DownloadByInitializer: true,
  630. Indirect: dataset.Spec.URL != originalDataURLOrIndex,
  631. },
  632. Name: "origin-dataset",
  633. EnvName: "ORIGINAL_DATASET_URL",
  634. },
  635. )
  636. }
  637. // set the default policy instead of Always policy
  638. workerParam.RestartPolicy = v1.RestartPolicyOnFailure
  639. workerParam.HostNetwork = true
  640. // create pod based on podtype
  641. _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, &workerParam)
  642. return
  643. }
  644. func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error {
  645. infermodelName := job.Spec.DeploySpec.Model.Name
  646. inferModel, err := c.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{})
  647. if err != nil {
  648. return fmt.Errorf("failed to get infer model %s: %w",
  649. infermodelName, err)
  650. }
  651. inferModelURL := inferModel.Spec.URL
  652. HEMParameterJSON, _ := json.Marshal(job.Spec.DeploySpec.HardExampleMining.Parameters)
  653. HEMParameterString := string(HEMParameterJSON)
  654. modelSecret, err := c.getSecret(
  655. job.Namespace,
  656. inferModel.Spec.CredentialName,
  657. fmt.Sprintf("model %s", inferModel.Name),
  658. )
  659. // Configure inference worker's mounts and envs
  660. var workerParam runtime.WorkerParam
  661. workerParam.Mounts = append(workerParam.Mounts,
  662. runtime.WorkerMount{
  663. URL: &runtime.MountURL{
  664. URL: inferModelURL,
  665. Secret: modelSecret,
  666. DownloadByInitializer: true,
  667. },
  668. Name: "model",
  669. EnvName: "MODEL_URL",
  670. },
  671. )
  672. workerParam.Env = map[string]string{
  673. "NAMESPACE": job.Namespace,
  674. "JOB_NAME": job.Name,
  675. "WORKER_NAME": "inferworker-" + utilrand.String(5),
  676. "HEM_NAME": job.Spec.DeploySpec.HardExampleMining.Name,
  677. "HEM_PARAMETERS": HEMParameterString,
  678. "LC_SERVER": c.cfg.LC.Server,
  679. }
  680. workerParam.WorkerType = runtime.InferencePodType
  681. workerParam.HostNetwork = true
  682. // create the inference worker
  683. _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, &workerParam)
  684. return err
  685. }
  686. // New creates a new incremental learning job controller that keeps the relevant pods
  687. // in sync with the corresponding IncrementalLearningJob objects.
  688. func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
  689. podInformer := cc.KubeInformerFactory.Core().V1().Pods()
  690. jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs()
  691. eventBroadcaster := record.NewBroadcaster()
  692. eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})
  693. jc := &Controller{
  694. kubeClient: cc.KubeClient,
  695. client: cc.SednaClient.SednaV1alpha1(),
  696. queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), Name),
  697. cfg: cc.Config,
  698. }
  699. jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  700. AddFunc: func(obj interface{}) {
  701. jc.enqueueController(obj, true)
  702. jc.syncToEdge(watch.Added, obj)
  703. },
  704. UpdateFunc: func(old, cur interface{}) {
  705. jc.enqueueController(cur, true)
  706. jc.syncToEdge(watch.Added, cur)
  707. },
  708. DeleteFunc: func(obj interface{}) {
  709. jc.enqueueController(obj, true)
  710. jc.syncToEdge(watch.Deleted, obj)
  711. },
  712. })
  713. jc.jobLister = jobInformer.Lister()
  714. jc.jobStoreSynced = jobInformer.Informer().HasSynced
  715. podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  716. AddFunc: jc.addPod,
  717. UpdateFunc: jc.updatePod,
  718. DeleteFunc: jc.deletePod,
  719. })
  720. jc.podStore = podInformer.Lister()
  721. jc.podStoreSynced = podInformer.Informer().HasSynced
  722. return jc, nil
  723. }