You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

incrementallearningjob.go 27 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859
  1. /*
  2. Copyright 2021 The KubeEdge Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package incrementallearning
  14. import (
  15. "context"
  16. "encoding/json"
  17. "fmt"
  18. "strings"
  19. "time"
  20. v1 "k8s.io/api/core/v1"
  21. "k8s.io/apimachinery/pkg/api/errors"
  22. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  23. "k8s.io/apimachinery/pkg/types"
  24. utilrand "k8s.io/apimachinery/pkg/util/rand"
  25. utilruntime "k8s.io/apimachinery/pkg/util/runtime"
  26. "k8s.io/apimachinery/pkg/util/wait"
  27. "k8s.io/client-go/kubernetes"
  28. "k8s.io/client-go/kubernetes/scheme"
  29. v1core "k8s.io/client-go/kubernetes/typed/core/v1"
  30. corelisters "k8s.io/client-go/listers/core/v1"
  31. "k8s.io/client-go/tools/cache"
  32. "k8s.io/client-go/tools/record"
  33. "k8s.io/client-go/util/workqueue"
  34. "k8s.io/klog/v2"
  35. k8scontroller "k8s.io/kubernetes/pkg/controller"
  36. sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
  37. sednaclientset "github.com/kubeedge/sedna/pkg/client/clientset/versioned/typed/sedna/v1alpha1"
  38. sednav1listers "github.com/kubeedge/sedna/pkg/client/listers/sedna/v1alpha1"
  39. "github.com/kubeedge/sedna/pkg/globalmanager/config"
  40. "github.com/kubeedge/sedna/pkg/globalmanager/runtime"
  41. )
  42. const (
  43. // Name is this controller name
  44. Name = "IncrementalLearning"
  45. // KindName is the kind name of CR this controller controls
  46. KindName = "IncrementalLearningJob"
  47. )
  48. // Kind contains the schema.GroupVersionKind for this controller type.
  49. var Kind = sednav1.SchemeGroupVersion.WithKind(KindName)
  50. // Controller ensures that all IncrementalLearningJob objects have corresponding pods to
  51. // run their configured workload.
  52. type Controller struct {
  53. kubeClient kubernetes.Interface
  54. client sednaclientset.SednaV1alpha1Interface
  55. // podStoreSynced returns true if the pod store has been synced at least once.
  56. // Added as a member to the struct to allow injection for testing.
  57. podStoreSynced cache.InformerSynced
  58. // jobStoreSynced returns true if the incrementaljob store has been synced at least once.
  59. // Added as a member to the struct to allow injection for testing.
  60. jobStoreSynced cache.InformerSynced
  61. // A store of jobs
  62. jobLister sednav1listers.IncrementalLearningJobLister
  63. // A store of pods, populated by the podController
  64. podStore corelisters.PodLister
  65. // IncrementalLearningJobs that need to be updated
  66. queue workqueue.RateLimitingInterface
  67. recorder record.EventRecorder
  68. cfg *config.ControllerConfig
  69. }
  70. // Run starts the main goroutine responsible for watching and syncing jobs.
  71. func (c *Controller) Run(stopCh <-chan struct{}) {
  72. // TODO: make workers parameter
  73. workers := 1
  74. defer utilruntime.HandleCrash()
  75. defer c.queue.ShutDown()
  76. klog.Infof("Starting %s controller", Name)
  77. defer klog.Infof("Shutting down %s controller", Name)
  78. if !cache.WaitForNamedCacheSync(Name, stopCh, c.podStoreSynced, c.jobStoreSynced) {
  79. klog.Errorf("failed to wait for %s caches to sync", Name)
  80. return
  81. }
  82. klog.Infof("Starting %s job workers", Name)
  83. for i := 0; i < workers; i++ {
  84. go wait.Until(c.worker, time.Second, stopCh)
  85. }
  86. <-stopCh
  87. }
  88. // enqueueByPod enqueues the jointInferenceService object of the specified pod.
  89. func (c *Controller) enqueueByPod(pod *v1.Pod, immediate bool) {
  90. controllerRef := metav1.GetControllerOf(pod)
  91. if controllerRef == nil {
  92. return
  93. }
  94. if controllerRef.Kind != Kind.Kind {
  95. return
  96. }
  97. service, err := c.jobLister.IncrementalLearningJobs(pod.Namespace).Get(controllerRef.Name)
  98. if err != nil {
  99. return
  100. }
  101. if service.UID != controllerRef.UID {
  102. return
  103. }
  104. c.enqueueController(service, immediate)
  105. }
  106. // When a pod is created, enqueue the controller that manages it and update it's expectations.
  107. func (c *Controller) addPod(obj interface{}) {
  108. pod := obj.(*v1.Pod)
  109. if pod.DeletionTimestamp != nil {
  110. // on a restart of the controller, it's possible a new pod shows up in a state that
  111. // is already pending deletion. Prevent the pod from being a creation observation.
  112. c.deletePod(pod)
  113. return
  114. }
  115. // backoff to queue when PodFailed
  116. immediate := pod.Status.Phase != v1.PodFailed
  117. c.enqueueByPod(pod, immediate)
  118. }
  119. // When a pod is updated, figure out what joint inference service manage it and wake them up.
  120. func (c *Controller) updatePod(old, cur interface{}) {
  121. curPod := cur.(*v1.Pod)
  122. oldPod := old.(*v1.Pod)
  123. // no pod update, no queue
  124. if curPod.ResourceVersion == oldPod.ResourceVersion {
  125. return
  126. }
  127. c.addPod(curPod)
  128. }
  129. // deletePod enqueues the jointinferenceservice obj When a pod is deleted
  130. func (c *Controller) deletePod(obj interface{}) {
  131. pod, ok := obj.(*v1.Pod)
  132. // comment from https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/job/job_controller.go
  133. // When a delete is dropped, the relist will notice a pod in the store not
  134. // in the list, leading to the insertion of a tombstone object which contains
  135. // the deleted key/value. Note that this value might be stale. If the pod
  136. // changed labels the new jointinferenceservice will not be woken up till the periodic resync.
  137. if !ok {
  138. tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
  139. if !ok {
  140. klog.Warningf("couldn't get object from tombstone %+v", obj)
  141. return
  142. }
  143. pod, ok = tombstone.Obj.(*v1.Pod)
  144. if !ok {
  145. klog.Warningf("tombstone contained object that is not a pod %+v", obj)
  146. return
  147. }
  148. }
  149. c.enqueueByPod(pod, true)
  150. }
  151. // obj could be an *sedna.IncrementalLearningJob, or a DeletionFinalStateUnknown marker item,
  152. // immediate tells the controller to update the status right away, and should
  153. // happen ONLY when there was a successful pod run.
  154. func (c *Controller) enqueueController(obj interface{}, immediate bool) {
  155. key, err := k8scontroller.KeyFunc(obj)
  156. if err != nil {
  157. utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
  158. return
  159. }
  160. backoff := time.Duration(0)
  161. if !immediate {
  162. backoff = runtime.GetBackoff(c.queue, key)
  163. }
  164. c.queue.AddAfter(key, backoff)
  165. }
  166. // worker runs a worker thread that just dequeues items, processes them, and marks them done.
  167. // It enforces that the syncHandler is never invoked concurrently with the same key.
  168. func (c *Controller) worker() {
  169. for c.processNextWorkItem() {
  170. }
  171. }
  172. func (c *Controller) processNextWorkItem() bool {
  173. key, quit := c.queue.Get()
  174. if quit {
  175. return false
  176. }
  177. defer c.queue.Done(key)
  178. forget, err := c.sync(key.(string))
  179. if err == nil {
  180. if forget {
  181. c.queue.Forget(key)
  182. }
  183. return true
  184. }
  185. utilruntime.HandleError(fmt.Errorf("Error syncing incrementallearning job: %v", err))
  186. c.queue.AddRateLimited(key)
  187. return true
  188. }
  189. // sync will sync the incrementallearning job with the given key if it has had its expectations fulfilled, meaning
  190. // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
  191. // concurrently with the same key.
  192. func (c *Controller) sync(key string) (bool, error) {
  193. startTime := time.Now()
  194. defer func() {
  195. klog.V(4).Infof("Finished syncing incrementallearning job %q (%v)", key, time.Since(startTime))
  196. }()
  197. ns, name, err := cache.SplitMetaNamespaceKey(key)
  198. if err != nil {
  199. return false, err
  200. }
  201. if len(ns) == 0 || len(name) == 0 {
  202. return false, fmt.Errorf("invalid incrementallearning job key %q: either namespace or name is missing", key)
  203. }
  204. sharedIncrementalJob, err := c.jobLister.IncrementalLearningJobs(ns).Get(name)
  205. if err != nil {
  206. if errors.IsNotFound(err) {
  207. klog.V(4).Infof("incrementallearning job has been deleted: %v", key)
  208. return true, nil
  209. }
  210. return false, err
  211. }
  212. incrementaljob := *sharedIncrementalJob
  213. // set kind for incrementaljob in case that the kind is None
  214. incrementaljob.SetGroupVersionKind(sednav1.SchemeGroupVersion.WithKind("IncrementalLearningJob"))
  215. // incrementaljob first start, create pod for inference
  216. if incrementaljob.Status.StartTime == nil {
  217. now := metav1.Now()
  218. incrementaljob.Status.StartTime = &now
  219. pod := c.getSpecifiedPods(&incrementaljob, runtime.InferencePodType)
  220. if pod == nil {
  221. err = c.createInferPod(&incrementaljob)
  222. } else {
  223. if pod.Status.Phase != v1.PodRunning && pod.Status.Phase != v1.PodPending {
  224. err = c.createInferPod(&incrementaljob)
  225. }
  226. }
  227. if err != nil {
  228. return false, nil
  229. }
  230. }
  231. // if incrementaljob was finished previously, we don't want to redo the termination
  232. if IsIncrementalJobFinished(&incrementaljob) {
  233. return true, nil
  234. }
  235. forget := false
  236. jobFailed := false
  237. needUpdated := false
  238. // update conditions of incremental job
  239. needUpdated, err = c.updateIncrementalJobConditions(&incrementaljob)
  240. if err != nil {
  241. klog.V(2).Infof("incrementallearning job %v/%v faied to be updated, err:%s", incrementaljob.Namespace, incrementaljob.Name, err)
  242. }
  243. if needUpdated {
  244. if err := c.updateIncrementalJobStatus(&incrementaljob); err != nil {
  245. return forget, err
  246. }
  247. if jobFailed && !IsIncrementalJobFinished(&incrementaljob) {
  248. // returning an error will re-enqueue IncrementalJob after the backoff period
  249. return forget, fmt.Errorf("failed pod(s) detected for incrementaljob key %q", key)
  250. }
  251. forget = true
  252. }
  253. return forget, err
  254. }
  255. // setWorkerNodeNameOfJob sets the worker nodeName of the specified job
  256. // which is used for downstream to sync job info to the specified LC located in nodeName.
  257. func (c *Controller) setWorkerNodeNameOfJob(job *sednav1.IncrementalLearningJob, jobStage string, nodeName string) error {
  258. key := runtime.AnnotationsKeyPrefix + jobStage
  259. ann := job.GetAnnotations()
  260. if ann != nil {
  261. if ann[key] == nodeName {
  262. // already set
  263. return nil
  264. }
  265. }
  266. jobClient := c.client.IncrementalLearningJobs(job.Namespace)
  267. var err error
  268. for i := 0; i <= runtime.ResourceUpdateRetries; i++ {
  269. var newJob *sednav1.IncrementalLearningJob
  270. newJob, err = jobClient.Get(context.TODO(), job.Name, metav1.GetOptions{})
  271. if err != nil {
  272. break
  273. }
  274. annotations := newJob.GetAnnotations()
  275. if annotations != nil {
  276. if annotations[key] == nodeName {
  277. return nil
  278. }
  279. }
  280. dataStr := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"%s"}}}`, key, nodeName)
  281. if _, err = jobClient.Patch(context.TODO(), job.Name, types.MergePatchType, []byte(dataStr), metav1.PatchOptions{}); err == nil {
  282. break
  283. }
  284. }
  285. return err
  286. }
  287. // updateIncrementalJobConditions ensures that conditions of incrementallearning job can be changed by podstatus
  288. func (c *Controller) updateIncrementalJobConditions(incrementaljob *sednav1.IncrementalLearningJob) (bool, error) {
  289. var initialType sednav1.ILJobStageConditionType
  290. var latestCondition sednav1.ILJobCondition = sednav1.ILJobCondition{
  291. Stage: sednav1.ILJobTrain,
  292. Type: initialType,
  293. }
  294. var newConditionType sednav1.ILJobStageConditionType
  295. var needUpdated = false
  296. jobConditions := incrementaljob.Status.Conditions
  297. var podStatus v1.PodPhase = v1.PodUnknown
  298. var pod *v1.Pod
  299. if len(jobConditions) > 0 {
  300. // get latest pod and pod status
  301. latestCondition = (jobConditions)[len(jobConditions)-1]
  302. klog.V(2).Infof("incrementallearning job %v/%v latest stage %v:", incrementaljob.Namespace, incrementaljob.Name,
  303. latestCondition.Stage)
  304. pod = c.getSpecifiedPods(incrementaljob, string(latestCondition.Stage))
  305. if pod != nil {
  306. podStatus = pod.Status.Phase
  307. }
  308. }
  309. jobStage := latestCondition.Stage
  310. currentType := latestCondition.Type
  311. newConditionType = currentType
  312. switch currentType {
  313. case initialType:
  314. newConditionType = sednav1.ILJobStageCondWaiting
  315. case sednav1.ILJobStageCondWaiting:
  316. // do nothing, waiting for LC to set type from waiting to ready
  317. case sednav1.ILJobStageCondReady:
  318. // create a pod, and set type from ready to starting
  319. // include train, eval, deploy pod
  320. var err error
  321. if jobStage == sednav1.ILJobDeploy {
  322. err = c.restartInferPod(incrementaljob)
  323. if err != nil {
  324. klog.V(2).Infof("incrementallearning job %v/%v inference pod failed to restart, err:%s", incrementaljob.Namespace, incrementaljob.Name, err)
  325. } else {
  326. klog.V(2).Infof("incrementallearning job %v/%v inference pod restarts successfully", incrementaljob.Namespace, incrementaljob.Name)
  327. }
  328. } else if podStatus != v1.PodPending && podStatus != v1.PodRunning {
  329. err = c.createPod(incrementaljob, jobStage)
  330. }
  331. if err != nil {
  332. return needUpdated, err
  333. }
  334. newConditionType = sednav1.ILJobStageCondStarting
  335. case sednav1.ILJobStageCondStarting, sednav1.ILJobStageCondRunning:
  336. if podStatus == v1.PodRunning {
  337. if jobStage == sednav1.ILJobDeploy {
  338. newConditionType = sednav1.ILJobStageCondCompleted
  339. } else {
  340. // watch pod status, if pod running, set type running
  341. newConditionType = sednav1.ILJobStageCondRunning
  342. // add nodeName to job
  343. if err := c.setWorkerNodeNameOfJob(incrementaljob, string(jobStage), pod.Spec.NodeName); err != nil {
  344. return needUpdated, err
  345. }
  346. }
  347. } else if podStatus == v1.PodSucceeded {
  348. // watch pod status, if pod completed, set type completed
  349. newConditionType = sednav1.ILJobStageCondCompleted
  350. klog.V(2).Infof("incrementallearning job %v/%v %v stage completed!", incrementaljob.Namespace, incrementaljob.Name, jobStage)
  351. } else if podStatus == v1.PodFailed {
  352. newConditionType = sednav1.ILJobStageCondFailed
  353. klog.V(2).Infof("incrementallearning job %v/%v %v stage failed!", incrementaljob.Namespace, incrementaljob.Name, jobStage)
  354. }
  355. case sednav1.ILJobStageCondCompleted:
  356. jobStage = getNextStage(jobStage)
  357. newConditionType = sednav1.ILJobStageCondWaiting
  358. case sednav1.ILJobStageCondFailed:
  359. jobStage = sednav1.ILJobTrain
  360. newConditionType = sednav1.ILJobStageCondWaiting
  361. default:
  362. // do nothing when given other type out of cases
  363. }
  364. klog.V(2).Infof("incrementallearning job %v/%v, conditions: %v", incrementaljob.Namespace, incrementaljob.Name, jobConditions)
  365. if latestCondition.Type != newConditionType {
  366. incrementaljob.Status.Conditions = append(incrementaljob.Status.Conditions, NewIncrementalJobCondition(newConditionType, jobStage))
  367. needUpdated = true
  368. return needUpdated, nil
  369. }
  370. return needUpdated, nil
  371. }
  372. // updateIncrementalJobStatus ensures that jobstatus can be updated rightly
  373. func (c *Controller) updateIncrementalJobStatus(incrementaljob *sednav1.IncrementalLearningJob) error {
  374. jobClient := c.client.IncrementalLearningJobs(incrementaljob.Namespace)
  375. var err error
  376. for i := 0; i <= runtime.ResourceUpdateRetries; i++ {
  377. var newIncrementalJob *sednav1.IncrementalLearningJob
  378. newIncrementalJob, err = jobClient.Get(context.TODO(), incrementaljob.Name, metav1.GetOptions{})
  379. if err != nil {
  380. break
  381. }
  382. newIncrementalJob.Status = incrementaljob.Status
  383. if _, err = jobClient.UpdateStatus(context.TODO(), newIncrementalJob, metav1.UpdateOptions{}); err == nil {
  384. break
  385. }
  386. }
  387. return err
  388. }
  389. func NewIncrementalJobCondition(conditionType sednav1.ILJobStageConditionType, jobStage sednav1.ILJobStage) sednav1.ILJobCondition {
  390. return sednav1.ILJobCondition{
  391. Type: conditionType,
  392. Status: v1.ConditionTrue,
  393. LastHeartbeatTime: metav1.Now(),
  394. LastTransitionTime: metav1.Now(),
  395. Reason: "",
  396. Message: "",
  397. Stage: jobStage,
  398. }
  399. }
  400. func (c *Controller) generatePodName(jobName string, workerType string) string {
  401. return jobName + "-" + strings.ToLower(workerType) + "-" + utilrand.String(5)
  402. }
  403. func (c *Controller) getSpecifiedPods(job *sednav1.IncrementalLearningJob, podType string) *v1.Pod {
  404. if podType == "Deploy" {
  405. podType = runtime.InferencePodType
  406. }
  407. var latestPod *v1.Pod
  408. selector, _ := runtime.GenerateSelector(job)
  409. pods, err := c.podStore.Pods(job.Namespace).List(selector)
  410. if len(pods) == 0 || err != nil {
  411. return nil
  412. }
  413. var matchTag = false
  414. latestPod = pods[0]
  415. for _, pod := range pods {
  416. s := strings.Split(pod.Name, "-")
  417. CurrentPodType := s[len(s)-2]
  418. if (latestPod.CreationTimestamp.Before(&pod.CreationTimestamp) || latestPod.CreationTimestamp.Equal(&pod.CreationTimestamp)) && CurrentPodType == strings.ToLower(podType) {
  419. latestPod = pod
  420. matchTag = true
  421. }
  422. }
  423. if !matchTag {
  424. return nil
  425. }
  426. return latestPod
  427. }
  428. func (c *Controller) restartInferPod(job *sednav1.IncrementalLearningJob) error {
  429. inferPod := c.getSpecifiedPods(job, runtime.InferencePodType)
  430. if inferPod == nil {
  431. klog.V(2).Infof("No inferpod is running in incrementallearning job %v/%v", job.Namespace, job.Name)
  432. err := c.createInferPod(job)
  433. return err
  434. }
  435. ctx := context.Background()
  436. err := c.kubeClient.CoreV1().Pods(job.Namespace).Delete(ctx, inferPod.Name, metav1.DeleteOptions{})
  437. if err != nil {
  438. klog.Warningf("failed to delete inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
  439. return err
  440. }
  441. err = c.createInferPod(job)
  442. if err != nil {
  443. klog.Warningf("failed to create inference pod %s for incrementallearning job %v/%v, err:%s", inferPod.Name, job.Namespace, job.Name, err)
  444. return err
  445. }
  446. return nil
  447. }
  448. func getNextStage(currentStage sednav1.ILJobStage) sednav1.ILJobStage {
  449. switch currentStage {
  450. case sednav1.ILJobTrain:
  451. return sednav1.ILJobEval
  452. case sednav1.ILJobEval:
  453. return sednav1.ILJobDeploy
  454. case sednav1.ILJobDeploy:
  455. return sednav1.ILJobTrain
  456. default:
  457. return sednav1.ILJobTrain
  458. }
  459. }
  460. func IsIncrementalJobFinished(j *sednav1.IncrementalLearningJob) bool {
  461. // TODO
  462. return false
  463. }
  464. func (c *Controller) getSecret(namespace, name string, ownerStr string) (secret *v1.Secret, err error) {
  465. if name != "" {
  466. secret, err = c.kubeClient.CoreV1().Secrets(namespace).Get(context.TODO(), name, metav1.GetOptions{})
  467. if err != nil {
  468. err = fmt.Errorf("failed to get the secret %s for %s: %w",
  469. name,
  470. ownerStr, err)
  471. }
  472. }
  473. return
  474. }
  475. func (c *Controller) createPod(job *sednav1.IncrementalLearningJob, podtype sednav1.ILJobStage) (err error) {
  476. ctx := context.Background()
  477. var podTemplate *v1.PodTemplateSpec
  478. incrementalDatasetName := job.Spec.Dataset.Name
  479. initialModelName := job.Spec.InitialModel.Name
  480. deployModelName := job.Spec.DeploySpec.Model.Name
  481. // check initial model name
  482. initialModel, err := c.client.Models(job.Namespace).Get(ctx, initialModelName, metav1.GetOptions{})
  483. if err != nil {
  484. return fmt.Errorf("failed to get initial model %s: %w",
  485. initialModelName, err)
  486. }
  487. _, err = c.client.Models(job.Namespace).Get(ctx, deployModelName, metav1.GetOptions{})
  488. if err != nil {
  489. return fmt.Errorf("failed to get deploy model %s: %w",
  490. deployModelName, err)
  491. }
  492. dataset, err := c.client.Datasets(job.Namespace).Get(ctx, incrementalDatasetName, metav1.GetOptions{})
  493. if err != nil {
  494. return fmt.Errorf("failed to get dataset %s: %w",
  495. incrementalDatasetName, err)
  496. }
  497. datasetSecret, err := c.getSecret(
  498. job.Namespace,
  499. dataset.Spec.CredentialName,
  500. fmt.Sprintf("dataset %s", dataset.Name),
  501. )
  502. if err != nil {
  503. return err
  504. }
  505. jobSecret, err := c.getSecret(
  506. job.Namespace,
  507. job.Spec.CredentialName,
  508. fmt.Sprintf("incremental job %s", job.Name),
  509. )
  510. if err != nil {
  511. return err
  512. }
  513. // get all url for train and eval from data in condition
  514. condDataStr := job.Status.Conditions[len(job.Status.Conditions)-1].Data
  515. klog.V(2).Infof("incrementallearning job %v/%v data condition:%s", job.Namespace, job.Name, condDataStr)
  516. var cond IncrementalCondData
  517. (&cond).Unmarshal([]byte(condDataStr))
  518. if cond.Input == nil {
  519. return fmt.Errorf("empty input from condData")
  520. }
  521. dataURL := cond.Input.DataURL
  522. inputmodelURLs := cond.GetInputModelURLs()
  523. var originalDataURLOrIndex string
  524. if cond.Input.DataIndexURL != "" {
  525. // this guarantee dataset.Spec.URL is not in host filesystem by LC,
  526. // but cond.Input.DataIndexURL could be in host filesystem.
  527. originalDataURLOrIndex = cond.Input.DataIndexURL
  528. } else {
  529. originalDataURLOrIndex = dataset.Spec.URL
  530. }
  531. var workerParam *runtime.WorkerParam = new(runtime.WorkerParam)
  532. if podtype == sednav1.ILJobTrain {
  533. workerParam.WorkerType = runtime.TrainPodType
  534. podTemplate = &job.Spec.TrainSpec.Template
  535. // Env parameters for train
  536. workerParam.Env = map[string]string{
  537. "NAMESPACE": job.Namespace,
  538. "JOB_NAME": job.Name,
  539. "WORKER_NAME": "train-worker-" + utilrand.String(5),
  540. "LC_SERVER": c.cfg.LC.Server,
  541. }
  542. baseModelURL := inputmodelURLs[0]
  543. var baseModelSecret *v1.Secret
  544. if baseModelURL == initialModel.Spec.URL {
  545. baseModelSecret, err = c.getSecret(
  546. job.Namespace,
  547. initialModel.Spec.CredentialName,
  548. fmt.Sprintf("initial model %s", initialModelName),
  549. )
  550. if err != nil {
  551. return err
  552. }
  553. } else {
  554. baseModelSecret = jobSecret
  555. }
  556. workerParam.Mounts = append(workerParam.Mounts,
  557. runtime.WorkerMount{
  558. URL: &runtime.MountURL{
  559. URL: baseModelURL,
  560. Secret: baseModelSecret,
  561. DownloadByInitializer: true,
  562. },
  563. EnvName: "BASE_MODEL_URL",
  564. },
  565. runtime.WorkerMount{
  566. URL: &runtime.MountURL{
  567. URL: cond.Input.OutputDir,
  568. Secret: jobSecret,
  569. DownloadByInitializer: false,
  570. },
  571. EnvName: "MODEL_URL",
  572. },
  573. runtime.WorkerMount{
  574. URL: &runtime.MountURL{
  575. URL: dataURL,
  576. DownloadByInitializer: true,
  577. Secret: jobSecret,
  578. },
  579. EnvName: "TRAIN_DATASET_URL",
  580. },
  581. // see https://github.com/kubeedge/sedna/issues/35
  582. runtime.WorkerMount{
  583. URL: &runtime.MountURL{
  584. Secret: datasetSecret,
  585. URL: originalDataURLOrIndex,
  586. DownloadByInitializer: true,
  587. Indirect: dataset.Spec.URL != originalDataURLOrIndex,
  588. },
  589. EnvName: "ORIGINAL_DATASET_URL",
  590. },
  591. )
  592. } else {
  593. podTemplate = &job.Spec.EvalSpec.Template
  594. workerParam.WorkerType = "Eval"
  595. // Configure Env information for eval by initial runtime.WorkerParam
  596. workerParam.Env = map[string]string{
  597. "NAMESPACE": job.Namespace,
  598. "JOB_NAME": job.Name,
  599. "WORKER_NAME": "eval-worker-" + utilrand.String(5),
  600. "LC_SERVER": c.cfg.LC.Server,
  601. }
  602. var modelMountURLs []runtime.MountURL
  603. for _, url := range inputmodelURLs {
  604. var modelSecret *v1.Secret
  605. if url == initialModel.Spec.URL {
  606. modelSecret, err = c.getSecret(
  607. job.Namespace,
  608. initialModel.Spec.CredentialName,
  609. fmt.Sprintf("initial model %s", initialModelName),
  610. )
  611. if err != nil {
  612. return err
  613. }
  614. } else {
  615. modelSecret = jobSecret
  616. }
  617. modelMountURLs = append(modelMountURLs, runtime.MountURL{
  618. URL: url,
  619. Secret: modelSecret,
  620. DownloadByInitializer: true,
  621. })
  622. }
  623. workerParam.Mounts = append(workerParam.Mounts,
  624. runtime.WorkerMount{
  625. URLs: modelMountURLs,
  626. Name: "models",
  627. EnvName: "MODEL_URLS",
  628. },
  629. runtime.WorkerMount{
  630. URL: &runtime.MountURL{
  631. URL: dataURL,
  632. Secret: datasetSecret,
  633. DownloadByInitializer: true,
  634. },
  635. Name: "datasets",
  636. EnvName: "TEST_DATASET_URL",
  637. },
  638. runtime.WorkerMount{
  639. URL: &runtime.MountURL{
  640. Secret: datasetSecret,
  641. URL: originalDataURLOrIndex,
  642. DownloadByInitializer: true,
  643. Indirect: dataset.Spec.URL != originalDataURLOrIndex,
  644. },
  645. Name: "origin-dataset",
  646. EnvName: "ORIGINAL_DATASET_URL",
  647. },
  648. )
  649. }
  650. // set the default policy instead of Always policy
  651. workerParam.RestartPolicy = v1.RestartPolicyOnFailure
  652. workerParam.HostNetwork = true
  653. // create pod based on podtype
  654. _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, podTemplate, workerParam)
  655. if err != nil {
  656. return err
  657. }
  658. return
  659. }
  660. func (c *Controller) createInferPod(job *sednav1.IncrementalLearningJob) error {
  661. infermodelName := job.Spec.DeploySpec.Model.Name
  662. inferModel, err := c.client.Models(job.Namespace).Get(context.TODO(), infermodelName, metav1.GetOptions{})
  663. if err != nil {
  664. return fmt.Errorf("failed to get infer model %s: %w",
  665. infermodelName, err)
  666. }
  667. inferModelURL := inferModel.Spec.URL
  668. // Env parameters for edge
  669. HEMParameterJSON, _ := json.Marshal(job.Spec.DeploySpec.HardExampleMining.Parameters)
  670. HEMParameterString := string(HEMParameterJSON)
  671. // Configure container mounting and Env information by initial runtime.WorkerParam
  672. modelSecret, err := c.getSecret(
  673. job.Namespace,
  674. inferModel.Spec.CredentialName,
  675. fmt.Sprintf("model %s", inferModel.Name),
  676. )
  677. var workerParam *runtime.WorkerParam = new(runtime.WorkerParam)
  678. workerParam.Mounts = append(workerParam.Mounts,
  679. runtime.WorkerMount{
  680. URL: &runtime.MountURL{
  681. URL: inferModelURL,
  682. Secret: modelSecret,
  683. DownloadByInitializer: true,
  684. },
  685. Name: "model",
  686. EnvName: "MODEL_URL",
  687. },
  688. )
  689. workerParam.Env = map[string]string{
  690. "NAMESPACE": job.Namespace,
  691. "JOB_NAME": job.Name,
  692. "WORKER_NAME": "inferworker-" + utilrand.String(5),
  693. "HEM_NAME": job.Spec.DeploySpec.HardExampleMining.Name,
  694. "HEM_PARAMETERS": HEMParameterString,
  695. "LC_SERVER": c.cfg.LC.Server,
  696. }
  697. workerParam.WorkerType = runtime.InferencePodType
  698. workerParam.HostNetwork = true
  699. // create edge pod
  700. _, err = runtime.CreatePodWithTemplate(c.kubeClient, job, &job.Spec.DeploySpec.Template, workerParam)
  701. return err
  702. }
  703. // New creates a new IncrementalJob controller that keeps the relevant pods
  704. // in sync with their corresponding IncrementalJob objects.
  705. func New(cc *runtime.ControllerContext) (runtime.FeatureControllerI, error) {
  706. podInformer := cc.KubeInformerFactory.Core().V1().Pods()
  707. jobInformer := cc.SednaInformerFactory.Sedna().V1alpha1().IncrementalLearningJobs()
  708. eventBroadcaster := record.NewBroadcaster()
  709. eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: cc.KubeClient.CoreV1().Events("")})
  710. jc := &Controller{
  711. kubeClient: cc.KubeClient,
  712. client: cc.SednaClient.SednaV1alpha1(),
  713. queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(runtime.DefaultBackOff, runtime.MaxBackOff), "incrementallearningjob"),
  714. recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "incrementallearningjob-controller"}),
  715. cfg: cc.Config,
  716. }
  717. jobInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  718. AddFunc: func(obj interface{}) {
  719. jc.enqueueController(obj, true)
  720. },
  721. UpdateFunc: func(old, cur interface{}) {
  722. jc.enqueueController(cur, true)
  723. },
  724. DeleteFunc: func(obj interface{}) {
  725. jc.enqueueController(obj, true)
  726. },
  727. })
  728. jc.jobLister = jobInformer.Lister()
  729. jc.jobStoreSynced = jobInformer.Informer().HasSynced
  730. podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
  731. AddFunc: jc.addPod,
  732. UpdateFunc: jc.updatePod,
  733. DeleteFunc: jc.deletePod,
  734. })
  735. jc.podStore = podInformer.Lister()
  736. jc.podStoreSynced = podInformer.Informer().HasSynced
  737. jc.addUpstreamHandler(cc)
  738. return jc, nil
  739. }