监控
Redis 监控仪表板和告警规则
仪表板
REDIS 模块有三个仪表板。
Redis 概览
Redis Overview:所有 Redis 实例的概览
Redis 集群
Redis Cluster:单个 Redis 集群的概览
Redis 实例
Redis Instance:单个 Redis 实例的概览
告警规则
Redis 有 6 个预定义的告警规则,定义在 `files/prometheus/rules/redis.yml` 中。
名称 | 描述 | 级别 |
---|---|---|
RedisDown | Redis 服务器宕机 | Critical |
RedisRejectConn | Redis 实例拒绝连接 | Critical |
RedisRTHigh | Redis 实例响应时间过高 | Warning |
RedisCPUHigh | Redis 实例 CPU 使用率过高 | Warning |
RedisMemHigh | Redis 实例内存使用率过高 | Warning |
RedisQPSHigh | Redis 实例 QPS 过高 | Info |
#==============================================================#
# Error #
#==============================================================#
# P0 alert: fires once redis_up has stayed below 1 for a full minute.
- alert: RedisDown
  expr: redis_up < 1
  for: 1m
  labels:
    level: 0
    severity: CRIT
    category: redis
  annotations:
    summary: 'CRIT RedisDown: {{ $labels.ins }} {{ $labels.instance }} {{ $value }}'
    # Second line of the description links to the redis-instance dashboard for this instance.
    description: |
      redis_up[ins={{ $labels.ins }}, instance={{ $labels.instance }}] = {{ $value }} == 0
      http://g.pigsty/d/redis-instance?from=now-5m&to=now&var-ins={{$labels.ins}}
# Critical alert: fires as soon as redis:ins:conn_reject is above 0.
# There is no `for` hold period on this rule, so it triggers on the first matching evaluation.
# NOTE(review): the description text suggests the recorded metric covers a 5m window — confirm
# against the recording rule definition.
- alert: RedisRejectConn
  expr: redis:ins:conn_reject > 0
  labels:
    level: 0
    severity: CRIT
    category: redis
  annotations:
    summary: 'CRIT RedisRejectConn: {{ $labels.ins }} {{ $labels.instance }} {{ $value }}'
    # Second line of the description links to panel 88 of the redis-instance dashboard.
    description: |
      redis:ins:conn_reject[cls={{ $labels.cls }}, ins={{ $labels.ins }}][5m] = {{ $value }} > 0
      http://g.pigsty/d/redis-instance?from=now-10m&to=now&viewPanel=88&fullscreen&var-ins={{ $labels.ins }}
#==============================================================#
# Latency #
#==============================================================#
# Warning alert: average Redis query response time (redis:ins:rt) above 0.00016,
# i.e. 160 µs, sustained for 1 minute.
- alert: RedisRTHigh
  expr: redis:ins:rt > 0.00016
  for: 1m
  labels:
    level: 1
    severity: WARN
    category: redis
  annotations:
    summary: "WARN RedisRTHigh: {{ $labels.cls }} {{ $labels.ins }}"
    # The description names the metric the rule actually evaluates (redis:ins:rt);
    # it previously referenced the PostgreSQL metric pg:ins:query_rt by mistake.
    description: |
      redis:ins:rt[cls={{ $labels.cls }}, ins={{ $labels.ins }}] = {{ $value }} > 160µs
      http://g.pigsty/d/redis-instance?from=now-10m&to=now&viewPanel=97&fullscreen&var-ins={{ $labels.ins }}
#==============================================================#
# Saturation #
#==============================================================#
# Warning alert: Redis instance CPU usage (redis:ins:cpu_usage) above 70% for 1 minute.
- alert: RedisCPUHigh
  expr: redis:ins:cpu_usage > 0.70
  for: 1m
  labels:
    level: 1
    severity: WARN
    category: redis
  annotations:
    summary: "WARN RedisCPUHigh: {{ $labels.cls }} {{ $labels.ins }}"
    # The description mirrors the expression: same metric name and the same 70% threshold
    # (it previously cited redis:ins:cpu_all and 60%).
    description: |
      redis:ins:cpu_usage[cls={{ $labels.cls }}, ins={{ $labels.ins }}] = {{ $value }} > 70%
      http://g.pigsty/d/redis-instance?from=now-10m&to=now&viewPanel=43&fullscreen&var-ins={{ $labels.ins }}
# Warning alert: Redis instance memory usage (redis:ins:mem_usage) above 70% for 1 minute.
- alert: RedisMemHigh
  expr: redis:ins:mem_usage > 0.70
  for: 1m
  labels:
    level: 1
    severity: WARN
    category: redis
  annotations:
    summary: "WARN RedisMemHigh: {{ $labels.cls }} {{ $labels.ins }}"
    # The description states the same 70% threshold as the expression (it previously said 80%).
    description: |
      redis:ins:mem_usage[cls={{ $labels.cls }}, ins={{ $labels.ins }}] = {{ $value }} > 70%
      http://g.pigsty/d/redis-instance?from=now-10m&to=now&viewPanel=7&fullscreen&var-ins={{ $labels.ins }}
#==============================================================#
# Traffic #
#==============================================================#
# Info alert: Redis instance QPS (redis:ins:qps) above 32000 for 5 minutes.
- alert: RedisQPSHigh
  expr: redis:ins:qps > 32000
  for: 5m
  labels:
    level: 2
    severity: INFO
    category: redis
  annotations:
    summary: "INFO RedisQPSHigh: {{ $labels.cls }} {{ $labels.ins }}"
    # The description states the same 32000 threshold as the expression (it previously said 16000).
    description: |
      redis:ins:qps[cls={{ $labels.cls }}, ins={{ $labels.ins }}] = {{ $value }} > 32000
      http://g.pigsty/d/redis-instance?from=now-10m&to=now&viewPanel=96&fullscreen&var-ins={{ $labels.ins }}