Browse Source

Merge pull request '智算网络NPU训练任务详情页面新增资源占用情况页签' (#3195) from fix-3105 into V20221116

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/3195
Reviewed-by: liuzx <liuzx@pcl.ac.cn>
tags/v1.22.11.2^2
liuzx 2 years ago
parent
commit
6140f0f508
7 changed files with 65 additions and 96 deletions
  1. +26
    -0
      modules/grampus/resty.go
  2. +1
    -0
      routers/api/v1/api.go
  3. +22
    -0
      routers/repo/grampus.go
  4. +2
    -83
      templates/repo/cloudbrain/trainjob/show.tmpl
  5. +11
    -5
      templates/repo/grampus/trainjob/show.tmpl
  6. +1
    -1
      templates/repo/modelarts/trainjob/show.tmpl
  7. +2
    -7
      web_src/js/index.js

+ 26
- 0
modules/grampus/resty.go View File

@@ -245,6 +245,32 @@ func GetTrainJobLog(jobID string) (string, error) {
return logContent, nil
}

func GetGrampusMetrics(jobID string) (models.GetTrainJobMetricStatisticResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetTrainJobMetricStatisticResult
res, err := client.R().
SetAuthToken(TOKEN).
Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics")

if err != nil {
return result, fmt.Errorf("resty GetTrainJobLog: %v", err)
}
if err = json.Unmarshal([]byte(res.String()), &result); err != nil {
log.Error("GetGrampusMetrics json.Unmarshal failed(%s): %v", res.String(), err.Error())
return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
if res.StatusCode() != http.StatusOK {
log.Error("Call GrampusMetrics failed(%d):%s(%s)", res.StatusCode(), result.ErrorCode, result.ErrorMsg)
return result, fmt.Errorf("Call GrampusMetrics failed(%d):%d(%s)", res.StatusCode(), result.ErrorCode, result.ErrorMsg)
}
if !result.IsSuccess {
log.Error("GetGrampusMetrics(%s) failed", jobID)
return result, fmt.Errorf("GetGrampusMetrics failed:%s", result.ErrorMsg)
}
return result, nil
}

func StopJob(jobID string) (*models.GrampusStopJobResponse, error) {
checkSetting()
client := getRestyClient()


+ 1
- 0
routers/api/v1/api.go View File

@@ -1048,6 +1048,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("", repo.GetModelArtsTrainJobVersion)
m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob)
m.Get("/log", repo_ext.GrampusGetLog)
m.Get("/metrics", repo_ext.GrampusMetrics)
m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
})
})


+ 22
- 0
routers/repo/grampus.go View File

@@ -957,6 +957,28 @@ func GrampusGetLog(ctx *context.Context) {
return
}

func GrampusMetrics(ctx *context.Context) {
jobID := ctx.Params(":jobid")
job, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
ctx.ServerError(err.Error(), err)
return
}

result, err := grampus.GetGrampusMetrics(job.JobID)
if err != nil {
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
}
ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"Interval": result.Interval,
"MetricsInfo": result.MetricsInfo,
})

return
}

func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) {
var command string



+ 2
- 83
templates/repo/cloudbrain/trainjob/show.tmpl View File

@@ -284,10 +284,7 @@
<div class="content-pad">
<div class="ui pointing secondary menu" style="border-bottom: 1px solid rgba(34,36,38,.15);">
<a class="active item"
data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
<a class="item" data-tab="second{{$k}}"
onclick="javascript:parseInfo()">{{$.i18n.Tr "repo.cloudbrain.runinfo"}}</a>
data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>

<a class="item log_bottom" data-tab="third{{$k}}"
data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
@@ -504,25 +501,6 @@

</div>
</div>

<div class="ui tab" data-tab="second{{$k}}">
<div>
<div class="ui message message{{.VersionName}}" style="display: none;">
<div id="header"></div>
</div>
<div class="ui attached log" id="log_state{{.VersionName}}"
style="height: 390px !important; overflow: auto;">
<input type="hidden" id="json_value" value="{{$.result.JobStatus.AppExitDiagnostics}}">
<input type="hidden" id="ExitDiagnostics" value="{{$.ExitDiagnostics}}">
<span id="info_display" class="info_text">

</span>
</div>

</div>

</div>

<div class="ui tab" data-tab="third{{$k}}">
<div class="file-info">
<a id="{{.VersionName}}-log-down"
@@ -922,66 +900,7 @@
$('.secondary.menu .item').tab();
});

let userName
let repoPath
let jobID
let downlaodFlag = {{ $.canDownload }}
let taskID = {{ $.task.ID }}
let realJobName = {{ $.task.JobName }}
$(document).ready(function () {
let url = window.location.href;
let urlArr = url.split('/')
userName = urlArr.slice(-5)[0]
repoPath = urlArr.slice(-4)[0]
jobID = urlArr.slice(-1)[0]
})
function stopBubbling(e) {
e = window.event || e;
if (e.stopPropagation) {
e.stopPropagation(); //阻止事件 冒泡传播
} else {
e.cancelBubble = true; //ie兼容
}
}

function loadLog(version_name) {
document.getElementById("mask").style.display = "block"
let startLine = $('input[name=end_line]').val();
if(startLine==""){
startLine=0;
}
let endLine = $('input[name=end_line]').val();
if(endLine==""){
endLine = 50;
}
$.get(`/${userName}/${repoPath}/cloudbrain/train-job/${jobID}/get_log?endLine=${endLine}&startLine=${startLine}`, (data) => {
$('input[name=end_line]').val(data.EndLine)
$('input[name=start_line]').val(data.StartLine)
$(`#log_file${version_name}`).text(data.Content)
document.getElementById("mask").style.display = "none"
}).fail(function (err) {
console.log(err);
document.getElementById("mask").style.display = "none"
});
}

function refreshStatus(version_name) {
$.get(`/api/v1/repos/${userName}/${repoPath}/cloudbrain/${taskID}?version_name=${versionname}`, (data) => {
// header status and duration
//$(`#${version_name}-duration-span`).text(data.JobDuration)
$(`#${version_name}-status-span span`).text(data.JobStatus)
$(`#${version_name}-status-span i`).attr("class", data.JobStatus)
// detail status and duration
//$('#'+version_name+'-duration').text(data.JobDuration)
$('#' + version_name + '-status').text(data.JobStatus)
loadLog(version_name)


}).fail(function (err) {
console.log(err);
});
stopBubbling(arguments.callee.caller.arguments[0])
}

function parseInfo() {
let jsonValue = document.getElementById("json_value").value;


+ 11
- 5
templates/repo/grampus/trainjob/show.tmpl View File

@@ -238,11 +238,8 @@
<span>
<div style="float: right;">
{{$.CsrfTokenHtml}}
</div>
<div class="ac-display-inblock title_text acc-margin-bottom">

<span class="cti-mgRight-sm">{{TimeSinceUnix1 .CreatedUnix}}</span>
<span class="cti-mgRight-sm">
{{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}}</span>
@@ -260,7 +257,6 @@
<span class="refresh-status" data-tooltip="刷新" style="cursor: pointer;" data-inverted="" data-version="{{.VersionName}}">
<i class="redo icon redo-color"></i>
</span>

</div>
<div style="float: right;">
{{if and ($.canDownload) (ne .Status "WAITING") ($.Permission.CanWrite $.UnitTypeModelManage) }}
@@ -269,7 +265,6 @@
{{else}}
<a class="ti-action-menu-item disabled" id="{{.VersionName}}-create-model">{{$.i18n.Tr "repo.modelarts.create_model"}}</a>
{{end}}

</div>
</span>
</span>
@@ -282,6 +277,9 @@

<a class="active item" data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
<a class="item log_bottom" data-tab="second{{$k}}" data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
{{ if eq $.Spec.ComputeResource "NPU"}}
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}" data-path="{{$.RepoRelPath}}/grampus/train-job/{{.JobID}}/metrics">{{$.i18n.Tr "cloudbrain.resource_use"}}</a>
{{end}}
<a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a>
</div>
<div class="ui tab active" data-tab="first{{$k}}">
@@ -564,6 +562,14 @@

</div>

</div>
<div class="ui tab" data-tab="four{{$k}}" style="position: relative;">
<i class="ri-refresh-line metric_chart"
style="position: absolute;right: 25%;color:#3291f8;z-index:99;cursor: pointer;"
data-version="{{.VersionName}}"></i>
<div id="metric-{{.VersionName}}" style="height: 260px;width: 870px;">
</div>
</div>
<div class="ui tab" data-tab="third{{$k}}">
<input type="hidden" name="model{{.VersionName}}" value="-1">


+ 1
- 1
templates/repo/modelarts/trainjob/show.tmpl View File

@@ -321,7 +321,7 @@
data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
<a class="item log_bottom" data-tab="second{{$k}}"
data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}">{{$.i18n.Tr "cloudbrain.resource_use"}}</a>
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}" data-path="{{$.RepoRelPath}}/modelarts/train-job/{{.JobID}}/metric_statistics?version_name={{.VersionName}}&statistic_type=each&metrics=">{{$.i18n.Tr "cloudbrain.resource_use"}}</a>
<a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a>
</div>
<div class="ui tab active" data-tab="first{{$k}}">


+ 2
- 7
web_src/js/index.js View File

@@ -5071,12 +5071,7 @@ function initcreateRepo() {
initcreateRepo();

function initChartsNpu() {
const url = window.location.href;
const urlArr = url.split("/");
let userName = urlArr.slice(-5)[0];
let repoPath = urlArr.slice(-4)[0];
let jobID = urlArr.slice(-1)[0];

const repoPath = $('.metric_chart').data('path')
let options = {
legend: {
data: [],
@@ -5127,7 +5122,7 @@ function initChartsNpu() {
document.getElementById(`metric-${versionName}`)
);
$.get(
`${window.config.AppSubUrl}/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/metric_statistics?version_name=${versionName}&statistic_type=each&metrics=`,
`${window.config.AppSubUrl}/api/v1/repos/${repoPath}`,
(res) => {
let filterDta = res.MetricsInfo.filter((item) => {
return ![


Loading…
Cancel
Save