监控
监控 MinIO 集群
仪表板
MINIO 模块提供了一个监控仪表板:
MinIO Overview:单个 MinIO 集群的概览
告警规则
为 MinIO 预定义了 3 个告警规则,定义在 files/prometheus/rules/minio.yml 中:
MinioServerDown
MinioNodeOffline
MinioDiskOffline
#==============================================================#
#                          Aliveness                           #
#==============================================================#
# MinioServerDown: fires when `minio_up` stays below 1 for one minute.
# Labelled level 0 / severity CRIT, the most severe of the three MinIO rules.
# NOTE(review): `minio_up` is not defined in this snippet — presumably a
# per-instance liveness series; confirm where it is produced.
- alert: MinioServerDown
  expr: minio_up < 1
  for: 1m
  labels: { level: 0, severity: CRIT, category: minio }
  annotations:
    summary: "CRIT MinioServerDown {{ $labels.ins }}@{{ $labels.instance }}"
    # Literal block scalar: line 1 is the observed value, line 2 the dashboard link.
    description: |
      minio_up[ins={{ $labels.ins }}, instance={{ $labels.instance }}] = {{ $value }} < 1
      http://g.pigsty/d/minio-overview
#==============================================================#
#                            Error                             #
#==============================================================#
# MinioNodeOffline: level 1 (p1) / severity WARN alert.
# Fires when the 5-minute average of `minio_cluster_nodes_offline_total`
# (job="minio") is above 0 and the condition holds for 3 minutes.
- alert: MinioNodeOffline
  expr: avg_over_time(minio_cluster_nodes_offline_total{job="minio"}[5m]) > 0
  for: 3m
  labels: { level: 1, severity: WARN, category: minio }
  annotations:
    summary: "WARN MinioNodeOffline: {{ $labels.cls }} {{ $value }}"
    # Literal block scalar: line 1 is the observed value, line 2 links to the
    # overview dashboard for the last 5 minutes, filtered by the `cls` label.
    description: |
      minio_cluster_nodes_offline_total[cls={{ $labels.cls }}] = {{ $value }} > 0
      http://g.pigsty/d/minio-overview?from=now-5m&to=now&var-cls={{$labels.cls}}
# MinioDiskOffline: level 1 (p1) / severity WARN alert.
# Fires when the 5-minute average of `minio_cluster_disk_offline_total`
# (job="minio") is above 0 and the condition holds for 3 minutes.
- alert: MinioDiskOffline
  expr: avg_over_time(minio_cluster_disk_offline_total{job="minio"}[5m]) > 0
  for: 3m
  labels: { level: 1, severity: WARN, category: minio }
  annotations:
    summary: "WARN MinioDiskOffline: {{ $labels.cls }} {{ $value }}"
    # Literal block scalar: line 1 is the observed value, line 2 links to the
    # overview dashboard for the last 5 minutes, filtered by the `cls` label.
    description: |
      minio_cluster_disk_offline_total[cls={{ $labels.cls }}] = {{ $value }} > 0
      http://g.pigsty/d/minio-overview?from=now-5m&to=now&var-cls={{$labels.cls}}