You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

slurm_parser.go 12 kB

3 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. package utils
  2. import (
  3. "bufio"
  4. "fmt"
  5. "os"
  6. "regexp"
  7. "strconv"
  8. "strings"
  9. )
// SlurmResource holds the resource specification parsed from a SLURM batch
// script's #SBATCH directives. All numeric values are stored as strings,
// exactly as they appear in the script.
type SlurmResource struct {
	JobName      string `json:"job_name"`       // job name (--job-name / -J)
	CPUs         string `json:"cpus"`           // total CPU count; not parsed directly, derived in postProcess as tasks * cpus-per-task
	Memory       string `json:"memory"`         // memory request (--mem / --mem-per-cpu / --mem-per-node), unit suffix preserved
	Nodes        string `json:"nodes"`          // node count (--nodes / -N)
	Tasks        string `json:"tasks"`          // total task count (--ntasks / -n)
	TasksPerNode string `json:"tasks_per_node"` // tasks per node (--ntasks-per-node)
	CPUsPerTask  string `json:"cpus_per_task"`  // CPUs per task (--cpus-per-task / -c)
	Partition    string `json:"partition"`      // queue/partition (--partition / -p)
	Time         string `json:"time"`           // wall-time limit (--time / -t)
	Output       string `json:"output"`         // stdout file (--output / -o)
	Error        string `json:"error"`          // stderr file (--error / -e)
	QOS          string `json:"qos"`            // quality of service (--qos)
	Account      string `json:"account"`        // account to charge (--account / -A)
	GPUs         string `json:"gpus"`           // GPU count (--gpus* / --gres=gpu:N)
	GPUType      string `json:"gpu_type"`       // GPU type (--gres=gpu:<type>:<count>)
	Constraint   string `json:"constraint"`     // node feature constraint (--constraint / -C)
	Exclusive    bool   `json:"exclusive"`      // exclusive node use requested (--exclusive)
	ArrayJobID   string `json:"array_job_id"`   // array job spec (--array / -a)
	WorkingDir   string `json:"working_dir"`    // working directory (--chdir / --workdir / -D)
	MailType     string `json:"mail_type"`      // mail notification events (--mail-type)
	MailUser     string `json:"mail_user"`      // mail recipient (--mail-user)
}
// SlurmParser extracts #SBATCH directives from SLURM batch scripts.
type SlurmParser struct {
	// patterns maps a logical field name (e.g. "job_name", "memory") to
	// the regexps recognizing that field's spellings; they are tried in
	// order and the first match wins (see matchPattern).
	patterns map[string][]*regexp.Regexp
}
  38. // NewSlurmParser 创建新的SLURM解析器
  39. func NewSlurmParser() *SlurmParser {
  40. parser := &SlurmParser{
  41. patterns: make(map[string][]*regexp.Regexp),
  42. }
  43. parser.initPatterns()
  44. return parser
  45. }
  46. // initPatterns 初始化所有匹配模式
  47. func (p *SlurmParser) initPatterns() {
  48. // 作业名称的各种写法
  49. p.patterns["job_name"] = []*regexp.Regexp{
  50. regexp.MustCompile(`#SBATCH\s+--job-name[=\s]+([^\s]+)`),
  51. regexp.MustCompile(`#SBATCH\s+-J\s+([^\s]+)`),
  52. regexp.MustCompile(`#SBATCH\s+--job-name\s*=\s*"([^"]+)"`),
  53. regexp.MustCompile(`#SBATCH\s+--job-name\s*=\s*'([^']+)'`),
  54. }
  55. // CPU相关的各种写法
  56. p.patterns["cpus_per_task"] = []*regexp.Regexp{
  57. regexp.MustCompile(`#SBATCH\s+--cpus-per-task[=\s]+(\d+)`),
  58. regexp.MustCompile(`#SBATCH\s+-c\s+(\d+)`),
  59. regexp.MustCompile(`#SBATCH\s+--cpus-per-task\s*=\s*(\d+)`),
  60. }
  61. // 内存的各种写法
  62. p.patterns["memory"] = []*regexp.Regexp{
  63. regexp.MustCompile(`#SBATCH\s+--mem[=\s]+([^\s]+)`),
  64. regexp.MustCompile(`#SBATCH\s+-m\s+([^\s]+)`),
  65. regexp.MustCompile(`#SBATCH\s+--mem\s*=\s*([^\s]+)`),
  66. regexp.MustCompile(`#SBATCH\s+--mem-per-cpu[=\s]+([^\s]+)`),
  67. regexp.MustCompile(`#SBATCH\s+--mem-per-node[=\s]+([^\s]+)`),
  68. }
  69. // 节点数的各种写法
  70. p.patterns["nodes"] = []*regexp.Regexp{
  71. regexp.MustCompile(`#SBATCH\s+--nodes[=\s]+(\d+)`),
  72. regexp.MustCompile(`#SBATCH\s+-N\s+(\d+)`),
  73. regexp.MustCompile(`#SBATCH\s+--nodes\s*=\s*(\d+)`),
  74. regexp.MustCompile(`#SBATCH\s+--nodes[=\s]+(\d+-\d+)`), // 范围格式
  75. }
  76. // 任务数的各种写法
  77. p.patterns["tasks"] = []*regexp.Regexp{
  78. regexp.MustCompile(`#SBATCH\s+--ntasks[=\s]+(\d+)`),
  79. regexp.MustCompile(`#SBATCH\s+-n\s+(\d+)`),
  80. regexp.MustCompile(`#SBATCH\s+--ntasks\s*=\s*(\d+)`),
  81. }
  82. // 每节点任务数
  83. p.patterns["tasks_per_node"] = []*regexp.Regexp{
  84. regexp.MustCompile(`#SBATCH\s+--ntasks-per-node[=\s]+(\d+)`),
  85. regexp.MustCompile(`#SBATCH\s+--ntasks-per-node\s*=\s*(\d+)`),
  86. }
  87. // 分区/队列的各种写法
  88. p.patterns["partition"] = []*regexp.Regexp{
  89. regexp.MustCompile(`#SBATCH\s+--partition[=\s]+([^\s]+)`),
  90. regexp.MustCompile(`#SBATCH\s+-p\s+([^\s]+)`),
  91. regexp.MustCompile(`#SBATCH\s+--partition\s*=\s*([^\s]+)`),
  92. regexp.MustCompile(`#SBATCH\s+--partition\s*=\s*"([^"]+)"`),
  93. }
  94. // 时间限制的各种写法
  95. p.patterns["time"] = []*regexp.Regexp{
  96. regexp.MustCompile(`#SBATCH\s+--time[=\s]+([^\s]+)`),
  97. regexp.MustCompile(`#SBATCH\s+-t\s+([^\s]+)`),
  98. regexp.MustCompile(`#SBATCH\s+--time\s*=\s*([^\s]+)`),
  99. }
  100. // 输出文件
  101. p.patterns["output"] = []*regexp.Regexp{
  102. regexp.MustCompile(`#SBATCH\s+--output[=\s]+([^\s]+)`),
  103. regexp.MustCompile(`#SBATCH\s+-o\s+([^\s]+)`),
  104. regexp.MustCompile(`#SBATCH\s+--output\s*=\s*([^\s]+)`),
  105. }
  106. // 错误文件
  107. p.patterns["error"] = []*regexp.Regexp{
  108. regexp.MustCompile(`#SBATCH\s+--error[=\s]+([^\s]+)`),
  109. regexp.MustCompile(`#SBATCH\s+-e\s+([^\s]+)`),
  110. regexp.MustCompile(`#SBATCH\s+--error\s*=\s*([^\s]+)`),
  111. }
  112. // 服务质量
  113. p.patterns["qos"] = []*regexp.Regexp{
  114. regexp.MustCompile(`#SBATCH\s+--qos[=\s]+([^\s]+)`),
  115. regexp.MustCompile(`#SBATCH\s+--qos\s*=\s*([^\s]+)`),
  116. }
  117. // 账户
  118. p.patterns["account"] = []*regexp.Regexp{
  119. regexp.MustCompile(`#SBATCH\s+--account[=\s]+([^\s]+)`),
  120. regexp.MustCompile(`#SBATCH\s+-A\s+([^\s]+)`),
  121. regexp.MustCompile(`#SBATCH\s+--account\s*=\s*([^\s]+)`),
  122. }
  123. // GPU相关
  124. p.patterns["gpus"] = []*regexp.Regexp{
  125. regexp.MustCompile(`#SBATCH\s+--gpus[=\s]+([^\s]+)`),
  126. regexp.MustCompile(`#SBATCH\s+--gpus-per-node[=\s]+([^\s]+)`),
  127. regexp.MustCompile(`#SBATCH\s+--gpus-per-task[=\s]+([^\s]+)`),
  128. regexp.MustCompile(`#SBATCH\s+--gres[=\s]+gpu:(\d+)`),
  129. regexp.MustCompile(`#SBATCH\s+--gres[=\s]+gpu:([^:]+):(\d+)`), // gpu类型:数量
  130. }
  131. // 约束条件
  132. p.patterns["constraint"] = []*regexp.Regexp{
  133. regexp.MustCompile(`#SBATCH\s+--constraint[=\s]+([^\s]+)`),
  134. regexp.MustCompile(`#SBATCH\s+-C\s+([^\s]+)`),
  135. regexp.MustCompile(`#SBATCH\s+--constraint\s*=\s*"([^"]+)"`),
  136. }
  137. // 独占节点
  138. p.patterns["exclusive"] = []*regexp.Regexp{
  139. regexp.MustCompile(`#SBATCH\s+--exclusive`),
  140. }
  141. // 数组作业
  142. p.patterns["array"] = []*regexp.Regexp{
  143. regexp.MustCompile(`#SBATCH\s+--array[=\s]+([^\s]+)`),
  144. regexp.MustCompile(`#SBATCH\s+-a\s+([^\s]+)`),
  145. }
  146. // 工作目录
  147. p.patterns["workdir"] = []*regexp.Regexp{
  148. regexp.MustCompile(`#SBATCH\s+--chdir[=\s]+([^\s]+)`),
  149. regexp.MustCompile(`#SBATCH\s+--workdir[=\s]+([^\s]+)`),
  150. regexp.MustCompile(`#SBATCH\s+-D\s+([^\s]+)`),
  151. }
  152. // 邮件通知
  153. p.patterns["mail_type"] = []*regexp.Regexp{
  154. regexp.MustCompile(`#SBATCH\s+--mail-type[=\s]+([^\s]+)`),
  155. }
  156. p.patterns["mail_user"] = []*regexp.Regexp{
  157. regexp.MustCompile(`#SBATCH\s+--mail-user[=\s]+([^\s]+)`),
  158. }
  159. }
  160. // ParseScript 解析SLURM脚本
  161. func (p *SlurmParser) ParseScript(scriptContent string) *SlurmResource {
  162. resource := &SlurmResource{}
  163. scanner := bufio.NewScanner(strings.NewReader(scriptContent))
  164. for scanner.Scan() {
  165. line := strings.TrimSpace(scanner.Text())
  166. // 跳过非SBATCH行和注释行
  167. if !strings.HasPrefix(line, "#SBATCH") {
  168. continue
  169. }
  170. // 处理每个字段
  171. p.parseField(line, "job_name", &resource.JobName)
  172. p.parseField(line, "cpus_per_task", &resource.CPUsPerTask)
  173. p.parseField(line, "memory", &resource.Memory)
  174. p.parseField(line, "nodes", &resource.Nodes)
  175. p.parseField(line, "tasks", &resource.Tasks)
  176. p.parseField(line, "tasks_per_node", &resource.TasksPerNode)
  177. p.parseField(line, "partition", &resource.Partition)
  178. p.parseField(line, "time", &resource.Time)
  179. p.parseField(line, "output", &resource.Output)
  180. p.parseField(line, "error", &resource.Error)
  181. p.parseField(line, "qos", &resource.QOS)
  182. p.parseField(line, "account", &resource.Account)
  183. p.parseField(line, "constraint", &resource.Constraint)
  184. p.parseField(line, "array", &resource.ArrayJobID)
  185. p.parseField(line, "workdir", &resource.WorkingDir)
  186. p.parseField(line, "mail_type", &resource.MailType)
  187. p.parseField(line, "mail_user", &resource.MailUser)
  188. // 处理GPU
  189. p.parseGPU(line, resource)
  190. // 处理exclusive
  191. if p.matchPattern(line, "exclusive") != "" {
  192. resource.Exclusive = true
  193. }
  194. }
  195. // 后处理:推导缺失的信息
  196. p.postProcess(resource)
  197. return resource
  198. }
  199. // parseField 解析单个字段
  200. func (p *SlurmParser) parseField(line, field string, target *string) {
  201. if *target == "" { // 只在字段为空时才设置
  202. if value := p.matchPattern(line, field); value != "" {
  203. *target = value
  204. }
  205. }
  206. }
  207. // parseGPU 解析GPU相关信息
  208. func (p *SlurmParser) parseGPU(line string, resource *SlurmResource) {
  209. if patterns, exists := p.patterns["gpus"]; exists {
  210. for _, pattern := range patterns {
  211. if matches := pattern.FindStringSubmatch(line); len(matches) > 1 {
  212. if strings.Contains(pattern.String(), "gres.*gpu:([^:]+):(\\d+)") && len(matches) > 2 {
  213. // gpu类型:数量格式
  214. resource.GPUType = matches[1]
  215. resource.GPUs = matches[2]
  216. } else {
  217. resource.GPUs = matches[1]
  218. }
  219. break
  220. }
  221. }
  222. }
  223. }
  224. // matchPattern 匹配模式并返回值
  225. func (p *SlurmParser) matchPattern(line, field string) string {
  226. if patterns, exists := p.patterns[field]; exists {
  227. for _, pattern := range patterns {
  228. if matches := pattern.FindStringSubmatch(line); len(matches) > 1 {
  229. return matches[1]
  230. }
  231. }
  232. }
  233. return ""
  234. }
  235. // postProcess 后处理,推导缺失信息
  236. func (p *SlurmParser) postProcess(resource *SlurmResource) {
  237. // 如果没有指定CPUs但有tasks和cpus_per_task,计算总CPU数
  238. if resource.CPUs == "" && resource.Tasks != "" && resource.CPUsPerTask != "" {
  239. if tasks, err1 := strconv.Atoi(resource.Tasks); err1 == nil {
  240. if cpusPerTask, err2 := strconv.Atoi(resource.CPUsPerTask); err2 == nil {
  241. resource.CPUs = strconv.Itoa(tasks * cpusPerTask)
  242. }
  243. }
  244. }
  245. // 如果只有tasks但没有nodes,假设为单节点
  246. if resource.Tasks != "" && resource.Nodes == "" && resource.TasksPerNode == "" {
  247. resource.Nodes = "1"
  248. }
  249. }
  250. // ParseFile 从文件解析SLURM脚本
  251. func (p *SlurmParser) ParseFile(filename string) (*SlurmResource, error) {
  252. content, err := os.ReadFile(filename)
  253. if err != nil {
  254. return nil, fmt.Errorf("读取文件失败: %v", err)
  255. }
  256. return p.ParseScript(string(content)), nil
  257. }
  258. // String 格式化输出
  259. func (r *SlurmResource) String() string {
  260. var result strings.Builder
  261. result.WriteString("SLURM资源规格:\n")
  262. result.WriteString("====================\n")
  263. if r.JobName != "" {
  264. result.WriteString(fmt.Sprintf("作业名称: %s\n", r.JobName))
  265. }
  266. if r.Partition != "" {
  267. result.WriteString(fmt.Sprintf("队列/分区: %s\n", r.Partition))
  268. }
  269. if r.Nodes != "" {
  270. result.WriteString(fmt.Sprintf("节点数: %s\n", r.Nodes))
  271. }
  272. if r.Tasks != "" {
  273. result.WriteString(fmt.Sprintf("任务数: %s\n", r.Tasks))
  274. }
  275. if r.TasksPerNode != "" {
  276. result.WriteString(fmt.Sprintf("每节点任务数: %s\n", r.TasksPerNode))
  277. }
  278. if r.CPUsPerTask != "" {
  279. result.WriteString(fmt.Sprintf("每任务CPU数: %s\n", r.CPUsPerTask))
  280. }
  281. if r.CPUs != "" {
  282. result.WriteString(fmt.Sprintf("总CPU数: %s\n", r.CPUs))
  283. }
  284. if r.Memory != "" {
  285. result.WriteString(fmt.Sprintf("内存: %s\n", r.Memory))
  286. }
  287. if r.GPUs != "" {
  288. result.WriteString(fmt.Sprintf("GPU数量: %s\n", r.GPUs))
  289. if r.GPUType != "" {
  290. result.WriteString(fmt.Sprintf("GPU类型: %s\n", r.GPUType))
  291. }
  292. }
  293. if r.Time != "" {
  294. result.WriteString(fmt.Sprintf("运行时间: %s\n", r.Time))
  295. }
  296. if r.Account != "" {
  297. result.WriteString(fmt.Sprintf("账户: %s\n", r.Account))
  298. }
  299. if r.QOS != "" {
  300. result.WriteString(fmt.Sprintf("服务质量: %s\n", r.QOS))
  301. }
  302. if r.Constraint != "" {
  303. result.WriteString(fmt.Sprintf("节点约束: %s\n", r.Constraint))
  304. }
  305. if r.Exclusive {
  306. result.WriteString("独占节点: 是\n")
  307. }
  308. if r.ArrayJobID != "" {
  309. result.WriteString(fmt.Sprintf("数组作业: %s\n", r.ArrayJobID))
  310. }
  311. if r.Output != "" {
  312. result.WriteString(fmt.Sprintf("输出文件: %s\n", r.Output))
  313. }
  314. if r.Error != "" {
  315. result.WriteString(fmt.Sprintf("错误文件: %s\n", r.Error))
  316. }
  317. return result.String()
  318. }
  319. // GetResourceSummary 获取核心资源摘要
  320. func (r *SlurmResource) GetResourceSummary() map[string]string {
  321. summary := make(map[string]string)
  322. if r.JobName != "" {
  323. summary["job_name"] = r.JobName
  324. }
  325. if r.Partition != "" {
  326. summary["queue"] = r.Partition
  327. }
  328. if r.Nodes != "" {
  329. summary["nodes"] = r.Nodes
  330. }
  331. if r.Tasks != "" {
  332. summary["tasks"] = r.Tasks
  333. }
  334. if r.CPUs != "" {
  335. summary["total_cpus"] = r.CPUs
  336. }
  337. if r.CPUsPerTask != "" {
  338. summary["cpus_per_task"] = r.CPUsPerTask
  339. }
  340. if r.Memory != "" {
  341. summary["memory"] = r.Memory
  342. }
  343. if r.GPUs != "" {
  344. summary["gpus"] = r.GPUs
  345. }
  346. if r.Time != "" {
  347. summary["time_limit"] = r.Time
  348. }
  349. return summary
  350. }

PCM is positioned as a software stack over cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.