Browse Source

show metric_statistic backend

tags/v1.22.6.1^2
lewis 3 years ago
parent
commit
b158cdcbeb
4 changed files with 96 additions and 0 deletions
  1. +11
    -0
      models/cloudbrain.go
  2. +41
    -0
      modules/modelarts/resty.go
  3. +1
    -0
      routers/api/v1/api.go
  4. +43
    -0
      routers/api/v1/repo/modelarts.go

+ 11
- 0
models/cloudbrain.go View File

@@ -1150,6 +1150,17 @@ type LogFile struct {
Name string
}

type GetTrainJobMetricStatisticResult struct {
TrainJobResult
Interval int `json:"interval"` //查询的时间间隔,单位为分钟
MetricsInfo []Metrics `json:"metrics"` //监控详情
}

type Metrics struct {
Metric string `json:"metric"` //监控指标项
Value []string `json:"value"` //获取的监控值的序列,元素为String类型
}

func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
sess := x.NewSession()
defer sess.Close()


+ 41
- 0
modules/modelarts/resty.go View File

@@ -1119,3 +1119,44 @@ sendjob:

return &result, nil
}

func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetTrainJobMetricStatisticResult

retry := 0

sendjob:
res, err := client.R().
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/pod/" + podName + "/metric-statistic")

if err != nil {
return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err)
}

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
goto sendjob
}

if res.StatusCode() != http.StatusOK {
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
}

if !result.IsSuccess {
log.Error("GetTrainJobMetricStatistic(%s) failed", jobID)
return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg)
}

return &result, nil
}

+ 1
- 0
routers/api/v1/api.go View File

@@ -922,6 +922,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/del_version", repo.DelTrainJobVersion)
m.Post("/stop_version", repo.StopTrainJobVersion)
m.Get("/model_list", repo.ModelList)
m.Get("/metric_statistics", repo.TrainJobGetMetricStatistic)
})
})
m.Group("/inference-job", func() {


+ 43
- 0
routers/api/v1/repo/modelarts.go View File

@@ -462,3 +462,46 @@ func ResultList(ctx *context.APIContext) {
"PageIsCloudBrain": true,
})
}

func TrainJobGetMetricStatistic(ctx *context.APIContext) {
var (
err error
)

var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")

result, err := trainJobGetMetricStatistic(jobID, versionName)
if err != nil {
log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error())
return
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"Interval": result.Interval,
"MetricsInfo": result.MetricsInfo,
})
}

func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) {
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
return nil, err
}

resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
return nil, err
}

result, err := modelarts.GetTrainJobMetricStatistic(jobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0])
if err != nil {
log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error())
return nil, err
}

return result, err
}

Loading…
Cancel
Save