| @@ -1150,6 +1150,17 @@ type LogFile struct { | |||||
| Name string | Name string | ||||
| } | } | ||||
| type GetTrainJobMetricStatisticResult struct { | |||||
| TrainJobResult | |||||
| Interval int `json:"interval"` //查询的时间间隔,单位为分钟 | |||||
| MetricsInfo []Metrics `json:"metrics"` //监控详情 | |||||
| } | |||||
// Metrics is a single monitored metric together with its collected values.
type Metrics struct {
	// Metric is the monitored metric item.
	Metric string `json:"metric"`

	// Value is the sequence of collected monitoring values; every element
	// is a string.
	Value []string `json:"value"`
}
| func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { | ||||
| sess := x.NewSession() | sess := x.NewSession() | ||||
| defer sess.Close() | defer sess.Close() | ||||
| @@ -1119,3 +1119,44 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.GetTrainJobMetricStatisticResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/pod/" + podName + "/metric-statistic") | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| if res.StatusCode() != http.StatusOK { | |||||
| var temp models.ErrorResult | |||||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||||
| } | |||||
| log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||||
| } | |||||
| if !result.IsSuccess { | |||||
| log.Error("GetTrainJobMetricStatistic(%s) failed", jobID) | |||||
| return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| @@ -922,6 +922,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||||
| m.Post("/del_version", repo.DelTrainJobVersion) | m.Post("/del_version", repo.DelTrainJobVersion) | ||||
| m.Post("/stop_version", repo.StopTrainJobVersion) | m.Post("/stop_version", repo.StopTrainJobVersion) | ||||
| m.Get("/model_list", repo.ModelList) | m.Get("/model_list", repo.ModelList) | ||||
| m.Get("/metric_statistics", repo.TrainJobGetMetricStatistic) | |||||
| }) | }) | ||||
| }) | }) | ||||
| m.Group("/inference-job", func() { | m.Group("/inference-job", func() { | ||||
| @@ -462,3 +462,46 @@ func ResultList(ctx *context.APIContext) { | |||||
| "PageIsCloudBrain": true, | "PageIsCloudBrain": true, | ||||
| }) | }) | ||||
| } | } | ||||
| func TrainJobGetMetricStatistic(ctx *context.APIContext) { | |||||
| var ( | |||||
| err error | |||||
| ) | |||||
| var jobID = ctx.Params(":jobid") | |||||
| var versionName = ctx.Query("version_name") | |||||
| result, err := trainJobGetMetricStatistic(jobID, versionName) | |||||
| if err != nil { | |||||
| log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error()) | |||||
| return | |||||
| } | |||||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||||
| "JobID": jobID, | |||||
| "Interval": result.Interval, | |||||
| "MetricsInfo": result.MetricsInfo, | |||||
| }) | |||||
| } | |||||
| func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) { | |||||
| task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) | |||||
| if err != nil { | |||||
| log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) | |||||
| return nil, err | |||||
| } | |||||
| resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) | |||||
| if err != nil { | |||||
| log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) | |||||
| return nil, err | |||||
| } | |||||
| result, err := modelarts.GetTrainJobMetricStatistic(jobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0]) | |||||
| if err != nil { | |||||
| log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error()) | |||||
| return nil, err | |||||
| } | |||||
| return result, err | |||||
| } | |||||