diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index 810e68d30..d0cf0ff84 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -1150,6 +1150,20 @@ type LogFile struct {
 	Name string
 }
 
+// GetTrainJobMetricStatisticResult is the response body decoded from the
+// ModelArts metric-statistic endpoint for one pod of a train job version.
+type GetTrainJobMetricStatisticResult struct {
+	TrainJobResult
+	Interval    int       `json:"interval"` // query interval, in minutes
+	MetricsInfo []Metrics `json:"metrics"`  // monitoring details
+}
+
+// Metrics holds the sampled values of a single monitoring metric.
+type Metrics struct {
+	Metric string   `json:"metric"` // name of the monitored metric
+	Value  []string `json:"value"`  // sequence of sampled values; each element is a string
+}
+
 func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
 	sess := x.NewSession()
 	defer sess.Close()
diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go
index 2f7d08c35..961e02538 100755
--- a/modules/modelarts/resty.go
+++ b/modules/modelarts/resty.go
@@ -1119,3 +1119,52 @@ sendjob:
 
 	return &result, nil
 }
+
+// GetTrainJobMetricStatistic queries ModelArts for the metric statistics of pod
+// podName in version versionID of train job jobID.
+//
+// A 401 response triggers one token refresh and a single retry. Any other
+// non-200 status, or a decoded body whose IsSuccess is false, is returned as
+// an error.
+func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetTrainJobMetricStatisticResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/pod/" + podName + "/metric-statistic")
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: if the refresh fails, the retry is expected to fail
+		// with 401 again and is reported by the status check below.
+		_ = getToken()
+		goto sendjob
+	}
+
+	if res.StatusCode() != http.StatusOK {
+		var temp models.ErrorResult
+		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
+			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+		}
+		log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+		return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+	}
+
+	if !result.IsSuccess {
+		log.Error("GetTrainJobMetricStatistic(%s) failed", jobID)
+		return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg)
+	}
+
+	return &result, nil
+}
diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go
index 9a05aa8ae..d6d3b001a 100755
--- a/routers/api/v1/api.go
+++ b/routers/api/v1/api.go
@@ -922,6 +922,7 @@ func RegisterRoutes(m *macaron.Macaron) {
 						m.Post("/del_version", repo.DelTrainJobVersion)
 						m.Post("/stop_version", repo.StopTrainJobVersion)
 						m.Get("/model_list", repo.ModelList)
+						m.Get("/metric_statistics", repo.TrainJobGetMetricStatistic)
 					})
 				})
 				m.Group("/inference-job", func() {
diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go
index 9e4edea03..c14976282 100755
--- a/routers/api/v1/repo/modelarts.go
+++ b/routers/api/v1/repo/modelarts.go
@@ -462,3 +462,63 @@ func ResultList(ctx *context.APIContext) {
 		"PageIsCloudBrain": true,
 	})
 }
+
+// TrainJobGetMetricStatistic handles the "/metric_statistics" route. It reads
+// the job ID from the ":jobid" path parameter and the version name from the
+// "version_name" query parameter, then writes the metric series as JSON.
+func TrainJobGetMetricStatistic(ctx *context.APIContext) {
+	jobID := ctx.Params(":jobid")
+	versionName := ctx.Query("version_name")
+
+	result, err := trainJobGetMetricStatistic(jobID, versionName)
+	if err != nil {
+		log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error())
+		// Always answer the request: returning without writing a body gives
+		// the client nothing to distinguish a failure from a success.
+		ctx.JSON(http.StatusInternalServerError, map[string]interface{}{
+			"JobID": jobID,
+			"Error": "failed to get metric statistics",
+		})
+		return
+	}
+
+	ctx.JSON(http.StatusOK, map[string]interface{}{
+		"JobID":       jobID,
+		"Interval":    result.Interval,
+		"MetricsInfo": result.MetricsInfo,
+	})
+}
+
+// trainJobGetMetricStatistic looks up the task for jobID and versionName, lists
+// the version's log files, and fetches the metrics using the first log file
+// name as the pod name. When no log file exists it returns an empty result
+// rather than indexing into an empty list.
+func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) {
+	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
+	if err != nil {
+		log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
+		return nil, err
+	}
+	versionID := strconv.FormatInt(task.VersionID, 10)
+
+	resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, versionID)
+	if err != nil {
+		log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
+		return nil, err
+	}
+
+	// Without this guard LogFileList[0] panics when the list is empty.
+	// NOTE(review): presumably an empty list means no pod has started yet — confirm.
+	if len(resultLogFile.LogFileList) == 0 {
+		log.Warn("GetTrainJobLogFileNames(%s) returned no log files", jobID)
+		return &models.GetTrainJobMetricStatisticResult{MetricsInfo: []models.Metrics{}}, nil
+	}
+
+	result, err := modelarts.GetTrainJobMetricStatistic(jobID, versionID, resultLogFile.LogFileList[0])
+	if err != nil {
+		log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error())
+		return nil, err
+	}
+
+	return result, nil
+}