|
|
|
@@ -4,6 +4,7 @@ import ( |
|
|
|
"bufio" |
|
|
|
"encoding/json" |
|
|
|
"errors" |
|
|
|
"fmt" |
|
|
|
"io" |
|
|
|
"net/http" |
|
|
|
"os" |
|
|
|
@@ -392,19 +393,39 @@ func StopJobs(cloudBrains []*models.Cloudbrain) { |
|
|
|
for _, taskInfo := range cloudBrains { |
|
|
|
|
|
|
|
if taskInfo.Type == models.TypeCloudBrainOne { |
|
|
|
err := cloudbrain.StopJob(taskInfo.JobID) |
|
|
|
err := retry(3, time.Second*30, func() error { |
|
|
|
return cloudbrain.StopJob(taskInfo.JobID) |
|
|
|
}) |
|
|
|
|
|
|
|
logErrorAndUpdateJobStatus(err, taskInfo) |
|
|
|
} else { |
|
|
|
param := models.NotebookAction{ |
|
|
|
Action: models.ActionStop, |
|
|
|
} |
|
|
|
_, err := modelarts.StopJob(taskInfo.JobID, param) |
|
|
|
err := retry(3, time.Second*30, func() error { |
|
|
|
_, err := modelarts.StopJob(taskInfo.JobID, param) |
|
|
|
return err |
|
|
|
}) |
|
|
|
logErrorAndUpdateJobStatus(err, taskInfo) |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func retry(attempts int, sleep time.Duration, f func() error) (err error) { |
|
|
|
for i := 0; i < attempts; i++ { |
|
|
|
if i > 0 { |
|
|
|
log.Warn("retrying after error:", err) |
|
|
|
time.Sleep(sleep) |
|
|
|
} |
|
|
|
err = f() |
|
|
|
if err == nil { |
|
|
|
return nil |
|
|
|
} |
|
|
|
} |
|
|
|
return fmt.Errorf("after %d attempts, last error: %s", attempts, err) |
|
|
|
} |
|
|
|
|
|
|
|
func logErrorAndUpdateJobStatus(err error, taskInfo *models.Cloudbrain) { |
|
|
|
if err != nil { |
|
|
|
log.Warn("Failed to stop cloudBrain job:"+taskInfo.JobID, err) |
|
|
|
|