You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

octopusHttp.go 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. package octopusHttp
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  13. omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
  14. "gitlink.org.cn/JointCloud/pcm-openi/common"
  15. "mime/multipart"
  16. "net/http"
  17. "strconv"
  18. "strings"
  19. )
// String constants used when building Octopus requests, job names and
// commands.
const (
	// RESOURCE_POOL is the resource pool name sent with job-creation and
	// resource-spec requests.
	RESOURCE_POOL = "common-pool"
	// Param_Token is the query parameter name that carries the access token.
	Param_Token = "token"
	// Param_Addr is the query parameter name that carries the platform host.
	Param_Addr = "addr"
	Forward_Slash = "/"
	// COMMA separates key and value in "key,value" parameter and
	// environment entries.
	COMMA = ","
	// UNDERSCORE joins the job name prefix and its random suffix.
	UNDERSCORE = "_"
	// TASK_NAME_PREFIX is the prefix of generated training job names.
	TASK_NAME_PREFIX = "trainJob"
	// Python is the interpreter invocation (note the trailing space) placed
	// in front of the algorithm ID when it is appended to a command.
	Python = "python "
	// SemiColon separates the existing command from the appended python call.
	SemiColon = ";"
)
const (
	// NotImplementError is the error message returned by operations that
	// this backend does not implement.
	NotImplementError = "not implemented"
)
// Octopus API paths. They are concatenated directly onto the server base URL
// and have no leading slash, so the base URL has to end with one.
const (
	MyAlgorithmListUrl = "api/v1/algorithm/myAlgorithmList"
	// ResourcespecsUrl is the path queried for resource specifications.
	ResourcespecsUrl = "api/v1/resource/specs"
	// CreateTrainJobUrl is the path a training job is posted to.
	CreateTrainJobUrl = "api/v1/job/create"
	TrainJobDetail = "api/v1/job/detail"
)
// compute source
var (
	// ComputeSourceToCardType maps a compute source name to its card type.
	// Keys are matched exactly, which is why some names appear in both
	// lower-case and upper-case spellings.
	ComputeSourceToCardType = map[string]string{
		"nvidia-a100": "GPU",
		"nvidia-a100-80g": "GPU",
		"mr-v100": "ILUVATAR-GPGPU",
		"bi-v100": "ILUVATAR-GPGPU",
		"MR-V50": "ILUVATAR-GPGPU",
		"BI-V100": "ILUVATAR-GPGPU",
		"BI-V150": "ILUVATAR-GPGPU",
		"MR-V100": "ILUVATAR-GPGPU",
		"cambricon.com/mlu": "MLU",
		"hygon.com/dcu": "DCU",
		"huawei.com/Ascend910": "NPU",
		"enflame.com/gcu": "GCU",
		"ILUVATAR-GPGPU": "ILUVATAR-GPGPU",
		"MXN260": "METAX-GPGPU",
	}
)
// OctopusHttp is a client for an Octopus platform's HTTP API.
type OctopusHttp struct {
	// server is the base URL that the API paths are appended to.
	server string
	// host is handed to NewToken and sent as the "addr" query parameter of
	// resource-spec requests.
	host string
	// platform is the platform name given to NewOctopusHttp.
	platform string
	// participantId is the participant's ID; it is reported as the ClusterId
	// of resource specs.
	participantId int64
	// token supplies the access token for requests through its Get method.
	token *Token
}
  66. func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) *OctopusHttp {
  67. token, _ := NewToken(host, user, pwd)
  68. return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token}
  69. }
  70. // executor
  71. func (o *OctopusHttp) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  72. switch mode {
  73. case executor.SUBMIT_MODE_JOINT_CLOUD:
  74. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  75. // cmd
  76. if option.AlgorithmId != "" {
  77. option.Cmd = option.Cmd + SemiColon + Python + option.AlgorithmId
  78. }
  79. option.ResourceId = "9e2feeae30e04492a4298755179f2ae0"
  80. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  81. if err != nil {
  82. return nil, err
  83. }
  84. return task, nil
  85. }
  86. return nil, nil
  87. }
  88. func (o *OctopusHttp) Stop(ctx context.Context, id string) error {
  89. //TODO implement me
  90. panic("implement me")
  91. }
  92. func (o *OctopusHttp) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  93. // octopus提交任务
  94. reqUrl := o.server + CreateTrainJobUrl
  95. token, err := o.token.Get()
  96. if err != nil {
  97. return nil, err
  98. }
  99. // python参数
  100. var prms []struct {
  101. Key string `json:"key"`
  102. Value string `json:"value"`
  103. }
  104. for _, param := range params {
  105. var p struct {
  106. Key string `json:"key"`
  107. Value string `json:"value"`
  108. }
  109. s := strings.Split(param, COMMA)
  110. p.Key = s[0]
  111. p.Value = s[1]
  112. prms = append(prms, p)
  113. }
  114. //环境变量
  115. envMap := make(map[string]string)
  116. for _, env := range envs {
  117. s := strings.Split(env, COMMA)
  118. envMap[s[0]] = s[1]
  119. }
  120. param := &omodel.CreateTrainJobParam{
  121. //DataSetId: datasetsId,
  122. //DataSetVersion: VERSION,
  123. //AlgorithmId: algorithmId,
  124. //AlgorithmVersion: VERSION,
  125. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  126. ImageId: imageId,
  127. IsDistributed: false,
  128. ResourcePool: RESOURCE_POOL,
  129. Config: []*omodel.CreateTrainJobConf{
  130. {
  131. Command: cmd,
  132. ResourceSpecId: resourceId,
  133. MinFailedTaskCount: 1,
  134. MinSucceededTaskCount: 1,
  135. TaskNumber: 1,
  136. Parameters: prms,
  137. Envs: envMap,
  138. },
  139. },
  140. }
  141. resp := &entity.OctCreateJobResp{}
  142. req := common.GetRestyRequest(common.TIMEOUT)
  143. _, err = req.
  144. SetHeader("Authorization", "Bearer "+token).
  145. SetBody(param).
  146. SetResult(resp).
  147. Post(reqUrl)
  148. if err != nil {
  149. return nil, err
  150. }
  151. return resp, nil
  152. }
  153. // collector
  154. func (o *OctopusHttp) resourceSpecs(ctx context.Context) (*entity.OctResourceSpecsResp, error) {
  155. resourcespecsUrl := o.server + ResourcespecsUrl
  156. token, err := o.token.Get()
  157. if err != nil {
  158. return nil, err
  159. }
  160. param := omodel.ResourceSpecParam{
  161. ResourcePool: RESOURCE_POOL,
  162. }
  163. b, _ := json.Marshal(param)
  164. byt := bytes.NewBuffer(b)
  165. resp := &entity.OctResourceSpecsResp{}
  166. req := common.GetRestyRequest(common.TIMEOUT)
  167. r, _ := http.NewRequest("GET", resourcespecsUrl, byt)
  168. req.RawRequest = r
  169. req.URL = resourcespecsUrl
  170. _, err = req.
  171. SetHeader("Content-Type", "application/json").
  172. SetQueryParam(Param_Token, token).
  173. SetQueryParam(Param_Addr, o.host).
  174. SetBody(byt).
  175. SetResult(resp).
  176. Send()
  177. if err != nil {
  178. return nil, err
  179. }
  180. return resp, nil
  181. }
  182. func (o *OctopusHttp) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  183. resp, err := o.resourceSpecs(ctx)
  184. if err != nil {
  185. return nil, err
  186. }
  187. if resp.Code != http.StatusOK {
  188. if resp.Data != nil {
  189. marshal, err := json.Marshal(resp.Data)
  190. if err != nil {
  191. return nil, err
  192. }
  193. errormdl := &omodel.Error{}
  194. err = json.Unmarshal(marshal, errormdl)
  195. if err != nil {
  196. return nil, err
  197. }
  198. return nil, errors.New(errormdl.Message)
  199. }
  200. } else {
  201. if resp.Data != nil {
  202. spec := &entity.OctResourceSpecs{}
  203. marshal, err := json.Marshal(resp.Data)
  204. if err != nil {
  205. return nil, err
  206. }
  207. err = json.Unmarshal(marshal, spec)
  208. if err != nil {
  209. return nil, err
  210. }
  211. }
  212. }
  213. return nil, nil
  214. }
  215. func (o *OctopusHttp) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  216. return nil, nil
  217. }
  218. func (o *OctopusHttp) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  219. //TODO implement me
  220. panic("implement me")
  221. }
  222. func (o *OctopusHttp) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  223. //TODO implement me
  224. panic("implement me")
  225. }
  226. func (o *OctopusHttp) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  227. //TODO implement me
  228. panic("implement me")
  229. }
  230. func (o *OctopusHttp) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  231. //TODO implement me
  232. panic("implement me")
  233. }
  234. func (o *OctopusHttp) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  235. //TODO implement me
  236. panic("implement me")
  237. }
  238. func (o OctopusHttp) GetComputeCards(ctx context.Context) ([]string, error) {
  239. //TODO implement me
  240. panic("implement me")
  241. }
  242. func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) {
  243. //TODO implement me
  244. panic("implement me")
  245. }
  246. func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  247. resp, err := o.resourceSpecs(ctx)
  248. if err != nil {
  249. return nil, err
  250. }
  251. res := &collector.ResourceSpec{
  252. ClusterId: strconv.FormatInt(o.participantId, 10),
  253. Tag: resrcType,
  254. }
  255. if resp.Code != http.StatusOK {
  256. if resp.Data != nil {
  257. marshal, err := json.Marshal(resp.Data)
  258. if err != nil {
  259. return nil, err
  260. }
  261. errormdl := &omodel.Error{}
  262. err = json.Unmarshal(marshal, errormdl)
  263. if err != nil {
  264. return nil, err
  265. }
  266. return nil, errors.New(errormdl.Message)
  267. }
  268. } else {
  269. if resp.Data != nil {
  270. specs := &entity.OctResourceSpecs{}
  271. marshal, err := json.Marshal(resp.Data)
  272. if err != nil {
  273. return nil, err
  274. }
  275. err = json.Unmarshal(marshal, specs)
  276. if err != nil {
  277. return nil, err
  278. }
  279. clusterResources, err := genSpecs(specs, resrcType)
  280. if err != nil {
  281. return nil, err
  282. }
  283. res.Resources = clusterResources
  284. }
  285. }
  286. return res, nil
  287. }
  288. func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, error) {
  289. res := make([]interface{}, 0)
  290. if resrcType == "Inference" {
  291. return res, nil
  292. } else if resrcType == "Train" {
  293. if specs.MapResourceSpecIdList.Train.ResourceSpecs == nil {
  294. return res, nil
  295. } else {
  296. for _, s := range specs.MapResourceSpecIdList.Train.ResourceSpecs {
  297. spec := &omodel.Spec{}
  298. marshal, err := json.Marshal(s)
  299. if err != nil {
  300. return nil, err
  301. }
  302. err = json.Unmarshal(marshal, specs)
  303. if err != nil {
  304. return nil, err
  305. }
  306. if spec.ResourceQuantity.BiV100 != "" {
  307. }
  308. //cres := &collector.ClusterResource{}
  309. //card := &collector.Usage{
  310. // Type: ComputeSource[i],
  311. // Name: strings.ToUpper(k),
  312. // Total: &collector.UnitValue{Unit: spec.ResourceQuantity, Value: v.AccCardsNum},
  313. // Available: &collector.UnitValue{Unit: NUMBER, Value: v.AccCardsNum},
  314. //}
  315. //spec.ResourceQuantity.
  316. }
  317. }
  318. }
  319. return nil, nil
  320. }
  321. // inference
  322. func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  323. return nil, errors.New(NotImplementError)
  324. }
  325. func (o *OctopusHttp) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  326. return nil, errors.New(NotImplementError)
  327. }
  328. func (o *OctopusHttp) StartInferDeployInstance(ctx context.Context, id string) bool {
  329. return false
  330. }
  331. func (o *OctopusHttp) StopInferDeployInstance(ctx context.Context, id string) bool {
  332. return false
  333. }
  334. func (o *OctopusHttp) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  335. return nil, errors.New(NotImplementError)
  336. }
  337. func (o *OctopusHttp) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  338. return "", errors.New(NotImplementError)
  339. }
  340. func (o *OctopusHttp) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool {
  341. return false
  342. }
  343. func (o *OctopusHttp) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  344. return "", errors.New(NotImplementError)
  345. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.