|
- train-job:  # Training-job settings
- docker-dataset-path: "/dataset"  # presumably the dataset path inside the training container - TODO confirm
-
- k8s:  # Kubernetes settings. NOTE(review): indentation is not visible in this view, so which of the following keys nest under k8s cannot be told from here - confirm
- # Kubeconfig file for the k8s cluster
- kubeconfig: kubeconfig_test
- # IP address exposed by the NFS service; change it to a suitable address when testing
- nfs: 127.0.0.1
- # Shared directory on the NFS server
- nfs-root-path: /nfs/
- nfs-root-windows-path: "Z:"  # presumably the same share as addressed from Windows - TODO confirm
- # IP address exposed by the file-storage service; change it to a suitable address when testing
- file-store: 127.0.0.1
- # Shared directory on the file-storage server
- file-store-root-path: /nfs/
- file-store-root-windows-path: "Z:"  # presumably the same share as addressed from Windows - TODO confirm
- # Namespace keyword
- namespace: namespace
- # k8s ingress domain; change it to a suitable domain when testing. Note: a wildcard DNS record for this domain must resolve to the IP of the k8s cluster master node
- host: notebooktest.dubhe.club
- # External port of the k8s ingress-controller
- port: 30865
- # External gRPC port of the k8s ingress-controller. NOTE(review): the key is named https-port while this comment says gRPC - confirm which is intended
- https-port: 31287
- # k8s model-deployment (serving) settings
- serving:
- # Domain of the container after an online-serving model is deployed (k8s ingress domain); it resolves to the k8s cluster address. Change it to a suitable domain when testing
- host: servingtest.dubhe.club
- # TLS certificate (crt)
- tls-crt:  # no value is set here
- # TLS certificate key
- tls-key:  # no value is set here
- # Service address exposed by elasticsearch
- elasticsearch:
- hostlist: ${eshostlist:127.0.0.1:30498}  # looks like a ${name:default} placeholder falling back to 127.0.0.1:30498 - confirm how the consumer resolves it
- # Log-collection settings
- log:
- type: _doc
- # Source-field filter (comma-separated field names)
- source_field: log,@timestamp,kubernetes.pod_name
- # Asynchronous callback
- callback:
- # With a standalone "boot" deployment the callback targets the local instance
- url: localhost:${server.port}
- token:
- # Secret key
- secret-key: 1qaz2wsx  # NOTE(review): secret stored in plain text in this file - consider supplying it from outside the file
- # Expiration time (seconds)
- expire-seconds: 300
- # Grafana URL that shows a Pod's CPU usage, memory usage and GPU usage
- pod:
- metrics:
- grafanaUrl: http://127.0.0.1:30006/d/job/monitor?orgId=1&refresh=5s&kiosk&var-pod=  # ends with "var-pod="; presumably the consumer appends the pod name - TODO confirm
- prometheus:  # Prometheus endpoint and PromQL query templates. The literal tokens pod-name-placeholder, node-name-placeholder, namespace-placeholder and usage-rate-day are presumably substituted by the consumer before a query is sent - TODO confirm
- url: http://127.0.0.1:30003/
- query: api/v1/query
- query-range: api/v1/query_range
- gpu-query-param: sum(DCGM_FI_DEV_GPU_UTIL{pod="pod-name-placeholder"})by(pod,UUID)
- gpu-mem-total-query-param: sum(DCGM_FI_DEV_FB_TOTAL_MEGABYTES{pod="pod-name-placeholder"})by(pod,UUID)
- gpu-mem-use-query-param: sum(DCGM_FI_DEV_FB_USED{pod="pod-name-placeholder"})by(pod,UUID)
- cpu-range-query-param: sum(rate(container_cpu_usage_seconds_total{image!="",pod="pod-name-placeholder"}[1m])) by (pod) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod)) * 100
- mem-range-query-param: sum(container_memory_rss{image!="",pod="pod-name-placeholder"})
- gpu-usage-query-param: sum by(Hostname,gpu)(DCGM_FI_DEV_GPU_UTIL{Hostname="node-name-placeholder",pod!=""})
- gpu-range-query-param: sum(DCGM_FI_DEV_GPU_UTIL{pod="pod-name-placeholder"}) by (pod,UUID)
- gpu-mem-total-range-query-param: sum(DCGM_FI_DEV_FB_TOTAL_MEGABYTES{pod="pod-name-placeholder"}) by (pod,UUID)
- gpu-mem-use-range-query-param: sum(DCGM_FI_DEV_FB_USED{pod="pod-name-placeholder"}) by (pod,UUID)
- gpu-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:DCGM_FI_DEV_GPU_UTIL:sumn[usage-rate-day])))
- cpu-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:container_cpu_user_seconds_total_sumn:raten[usage-rate-day])))
- mem-usage-rate-query-param: topk(10,sort_desc(max_over_time(namespace:CONTAINER_MEMERY_USAGE_BYTES:sumn{namespace=~"namespace.*"}[usage-rate-day])))  # NOTE(review): the metric name is spelled "MEMERY" here and in mem-usage-namespace-query-param; it has to match the name defined on the Prometheus side, so verify before changing it
- gpu-usage-namespace-query-param: max_over_time(namespace:DCGM_FI_DEV_GPU_UTIL:sumn{namespace=~"namespace-placeholder"}[usage-rate-day])
- cpu-usage-namespace-query-param: max_over_time(namespace:container_cpu_user_seconds_total_sumn:raten{namespace=~"namespace-placeholder"}[usage-rate-day])
- mem-usage-namespace-query-param: max_over_time(namespace:CONTAINER_MEMERY_USAGE_BYTES:sumn{namespace=~"namespace-placeholder"}[usage-rate-day])
- nfs-storage-class-name: zjlab-nfs-storage  # presumably the name of the k8s StorageClass used for NFS-backed volumes - TODO confirm
- namespace-limits:  # presumably per-namespace resource limits; units are not stated here - TODO confirm
- cpu: 10
- memory: 32
- gpu: 2
- # Harbor configuration
- harbor:
- address: harbor.dubhe.ai
- username: admin
- password: Harbor12345  # NOTE(review): credential stored in plain text in this file - consider supplying it from outside the file
- model-name: train
-
- # MinIO configuration
- minio:
- url: http://127.0.0.1:9000/
- accessKey: admin
- secretKey: 123@abc.com  # NOTE(review): credential stored in plain text in this file - consider supplying it from outside the file
- bucketName: dubhe-prod
- presignedUrlExpiryTime: 300  # unit is not stated here - TODO confirm
- annotation: /annotation/
-
- docker:  # Docker settings
- remote-api-port: 2375  # presumably the TCP port of the Docker remote API; 2375 is conventionally the unencrypted daemon port - confirm whether TLS is expected
|