You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

downstream.go 6.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. /*
  2. Copyright 2021 The KubeEdge Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package incrementallearning
  14. import (
  15. "context"
  16. "fmt"
  17. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  18. "k8s.io/apimachinery/pkg/util/sets"
  19. "k8s.io/apimachinery/pkg/watch"
  20. "k8s.io/klog/v2"
  21. sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
  22. "github.com/kubeedge/sedna/pkg/globalmanager/runtime"
  23. )
  24. // syncModelWithName will sync the model to the specified node.
  25. // Now called when creating the incrementaljob.
  26. func (c *Controller) syncModelWithName(nodeName, modelName, namespace string) error {
  27. model, err := c.client.Models(namespace).Get(context.TODO(), modelName, metav1.GetOptions{})
  28. if err != nil {
  29. // TODO: maybe use err.ErrStatus.Code == 404
  30. return fmt.Errorf("model(%s/%s) not found", namespace, modelName)
  31. }
  32. // Since model.Kind may be empty,
  33. // we need to fix the kind here if missing.
  34. // more details at https://github.com/kubernetes/kubernetes/issues/3030
  35. if len(model.Kind) == 0 {
  36. model.Kind = "Model"
  37. }
  38. runtime.InjectSecretAnnotations(c.kubeClient, model, model.Spec.CredentialName)
  39. c.sendToEdgeFunc(nodeName, watch.Added, model)
  40. return nil
  41. }
  42. func (c *Controller) syncToEdge(eventType watch.EventType, obj interface{}) error {
  43. job, ok := obj.(*sednav1.IncrementalLearningJob)
  44. if !ok {
  45. return nil
  46. }
  47. // Since Kind may be empty,
  48. // we need to fix the kind here if missing.
  49. // more details at https://github.com/kubernetes/kubernetes/issues/3030
  50. job.Kind = KindName
  51. dataName := job.Spec.Dataset.Name
  52. // LC has dataset object on this node that may call dataset node
  53. var dsNodeName string
  54. ds, err := c.client.Datasets(job.Namespace).Get(context.TODO(), dataName, metav1.GetOptions{})
  55. if err != nil {
  56. klog.Errorf("not found job(name=%s/%s)'s dataset, error: %v", job.Kind, job.Name, err)
  57. } else {
  58. dsNodeName = ds.Spec.NodeName
  59. }
  60. var trainNodeName string
  61. var evalNodeName string
  62. var deployNodeName string
  63. getAnnotationsNodeName := func(nodeName sednav1.ILJobStage) string {
  64. return runtime.AnnotationsKeyPrefix + string(nodeName)
  65. }
  66. ann := job.GetAnnotations()
  67. if ann != nil {
  68. trainNodeName = ann[getAnnotationsNodeName(sednav1.ILJobTrain)]
  69. evalNodeName = ann[getAnnotationsNodeName(sednav1.ILJobEval)]
  70. if _, ok := ann[runtime.ModelHotUpdateAnnotationsKey]; ok {
  71. deployNodeName = ann[getAnnotationsNodeName(sednav1.ILJobDeploy)]
  72. }
  73. }
  74. if eventType == watch.Deleted {
  75. // delete jobs from all LCs
  76. nodes := sets.NewString(dsNodeName, trainNodeName, evalNodeName, deployNodeName)
  77. for node := range nodes {
  78. c.sendToEdgeFunc(node, eventType, job)
  79. }
  80. return nil
  81. }
  82. if dsNodeName == "" {
  83. return nil
  84. }
  85. jobConditions := job.Status.Conditions
  86. if len(jobConditions) == 0 {
  87. return nil
  88. }
  89. latestCondition := jobConditions[len(jobConditions)-1]
  90. currentType := latestCondition.Type
  91. jobStage := latestCondition.Stage
  92. syncModelWithName := func(modelName string, nodeName string) {
  93. if err := c.syncModelWithName(nodeName, modelName, job.Namespace); err != nil {
  94. klog.Warningf("Error to sync model %s when sync incremental learning job %s to node %s: %v",
  95. modelName, job.Name, nodeName, err)
  96. }
  97. }
  98. syncJobWithNodeName := func(nodeName string) {
  99. if err := c.sendToEdgeFunc(nodeName, eventType, job); err != nil {
  100. klog.Warningf("Error to sync incremental learning job %s to node %s in stage %s: %v",
  101. job.Name, nodeName, jobStage, err)
  102. }
  103. }
  104. runtime.InjectSecretAnnotations(c.kubeClient, job, job.Spec.CredentialName)
  105. // isJobResidentNode checks whether nodeName is a job resident node
  106. isJobResidentNode := func(nodeName string) bool {
  107. // the node where LC monitors dataset and the node where inference worker is running are job resident node
  108. if nodeName == dsNodeName || nodeName == deployNodeName {
  109. return true
  110. }
  111. return false
  112. }
  113. // delete job
  114. deleteJob := func(nodeName string) {
  115. if !isJobResidentNode(nodeName) {
  116. // delete LC's job from nodeName that's different from dataset node when worker's status
  117. // is completed or failed.
  118. c.sendToEdgeFunc(nodeName, watch.Deleted, job)
  119. }
  120. }
  121. switch currentType {
  122. case sednav1.ILJobStageCondWaiting:
  123. switch jobStage {
  124. case sednav1.ILJobTrain:
  125. syncModelWithName(job.Spec.InitialModel.Name, dsNodeName)
  126. syncJobWithNodeName(dsNodeName)
  127. case sednav1.ILJobEval:
  128. syncModelWithName(job.Spec.DeploySpec.Model.Name, dsNodeName)
  129. if job.Spec.EvalSpec.InitialModel != nil {
  130. syncModelWithName(job.Spec.EvalSpec.InitialModel.Name, dsNodeName)
  131. }
  132. syncJobWithNodeName(dsNodeName)
  133. case sednav1.ILJobDeploy:
  134. deployNodeName = evalNodeName
  135. syncModelWithName(job.Spec.DeploySpec.Model.Name, evalNodeName)
  136. if job.Spec.EvalSpec.InitialModel != nil && !job.Spec.DeploySpec.Model.HotUpdateEnabled {
  137. syncModelWithName(job.Spec.EvalSpec.InitialModel.Name, deployNodeName)
  138. }
  139. syncJobWithNodeName(deployNodeName)
  140. }
  141. case sednav1.ILJobStageCondRunning:
  142. switch jobStage {
  143. case sednav1.ILJobTrain:
  144. syncJobWithNodeName(trainNodeName)
  145. case sednav1.ILJobEval:
  146. if trainNodeName != evalNodeName && trainNodeName != dsNodeName {
  147. c.sendToEdgeFunc(trainNodeName, watch.Deleted, job)
  148. }
  149. syncJobWithNodeName(evalNodeName)
  150. case sednav1.ILJobDeploy:
  151. if evalNodeName != deployNodeName && evalNodeName != dsNodeName {
  152. c.sendToEdgeFunc(evalNodeName, watch.Deleted, job)
  153. }
  154. if job.Spec.EvalSpec.InitialModel != nil {
  155. syncModelWithName(job.Spec.EvalSpec.InitialModel.Name, deployNodeName)
  156. }
  157. syncModelWithName(job.Spec.DeploySpec.Model.Name, deployNodeName)
  158. syncJobWithNodeName(deployNodeName)
  159. }
  160. case sednav1.ILJobStageCondCompleted, sednav1.ILJobStageCondFailed:
  161. if !job.Spec.DeploySpec.Model.HotUpdateEnabled {
  162. deployNodeName = evalNodeName
  163. }
  164. switch jobStage {
  165. case sednav1.ILJobTrain:
  166. deleteJob(trainNodeName)
  167. case sednav1.ILJobEval:
  168. deleteJob(evalNodeName)
  169. case sednav1.ILJobDeploy:
  170. deleteJob(deployNodeName)
  171. }
  172. }
  173. return nil
  174. }
  175. func (c *Controller) SetDownstreamSendFunc(f runtime.DownstreamSendFunc) error {
  176. c.sendToEdgeFunc = f
  177. return nil
  178. }