---
# Recording rules derived from node_exporter metrics (group `node_rules`).
# NOTE(review): the top-level `groups:` key is repeated once per group in this
# file, so each group looks like the body of a separate rule file. It is kept
# here as its own YAML document - confirm how the files are split before
# loading them into Prometheus.
groups:
  - name: node_rules
    rules:
      # CPU utilisation per instance in percent: 100 minus the idle share.
      # NOTE(review): the record name says rate5m but the expression uses
      # irate(); confirm which one is intended.
      - record: instance:node_cpu:avg_rate5m
        expr: >-
          100 - avg (irate(node_cpu_seconds_total{job="node_prod",mode="idle"}[5m]))
          by (instance) * 100

      # Count of idle-mode CPU series per instance.
      - record: instance:node_cpus:count
        expr: 'count by (instance)(node_cpu_seconds_total{mode="idle"})'

      # Keeps only node_load1 samples above twice the per-instance count of
      # idle-mode CPU series; other instances yield no sample.
      - record: instance:node_cpu_saturation_load1
        expr: >-
          node_load1 > on (instance) 2 *
          count by (instance)(node_cpu_seconds_total{job="node_prod",mode="idle"})

      # Share of total memory that is not free, cached or buffers, in percent.
      - record: instance:node_memory_usage:percentage
        expr: >-
          (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes))
          / node_memory_MemTotal_bytes{job="node_prod"} * 100

      # 1024 times the per-instance sum of the 1m pgpgin and pgpgout rates.
      # NOTE(review): the record name mentions swap while the expression reads
      # node_vmstat_pgpgin/pgpgout - confirm these are the intended counters.
      - record: instance:node_memory_swap_io_bytes:sum_rate
        expr: >-
          1024 * sum by (instance) (
          (rate(node_vmstat_pgpgin[1m])
          + rate(node_vmstat_pgpgout[1m]))
          )

      # Used share of the filesystem mounted at "/", in percent.
      - record: instance:root:node_filesystem_usage:percentage
        expr: >-
          (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})
          / node_filesystem_size_bytes{mountpoint="/"} * 100
---
# Alerting rules for node health (group `node_alerts`): CPU, load, memory,
# root-disk usage and prediction, disk IO, and scrape availability.
# NOTE(review): the top-level `groups:` key is repeated once per group in this
# file, so each group looks like the body of a separate rule file. It is kept
# here as its own YAML document - confirm how the files are split before
# loading them into Prometheus.
groups:
  - name: node_alerts
    rules:
      # Fires when the recorded CPU utilisation stays above 90 for 60 minutes.
      - alert: HighNodeCPU(CPU使用率)
        expr: instance:node_cpu:avg_rate5m > 90
        for: 60m
        labels:
          name: CPU
          severity: warning
        annotations:
          summary: "5分钟内的节点平均CPU使用率在至少60分钟内超过90%"
          description: "CPU使用率过高,5分钟内平均CPU使用率为 {{ humanize $value}}%"

      # The recorded series only has samples while the load threshold is
      # exceeded, so the bare series name is the whole condition.
      - alert: HighNodeLoad(CPU饱和度)
        expr: instance:node_cpu_saturation_load1
        for: 5m
        labels:
          name: Load
          severity: warning
        annotations:
          summary: "CPU负载平均数超过了CPU数量"
          description: "CPU平均负载至少5分钟内超过主机CPU数量的两倍"

      # Fires when recorded memory usage stays above 95 for 5 minutes.
      - alert: HighNodeMem(内存使用率)
        expr: instance:node_memory_usage:percentage > 95
        for: 5m
        labels:
          name: Memory
          severity: warning
        annotations:
          summary: "使用的内存百分比至少在5分钟内超过95%"
          description: "内存使用率过高,目前值为{{ humanize $value}}%"

      # Fires when recorded root-filesystem usage stays above 95 for 5 minutes.
      - alert: DiskUsage(磁盘使用量)
        expr: instance:root:node_filesystem_usage:percentage > 95
        for: 5m
        labels:
          name: Disk
          severity: warning
        annotations:
          summary: "{{$labels.device}}磁盘使用量超过95%"
          description: "{{$labels.instance}}的{{ $labels.mountpoint }}使用量为{{ humanize $value}}%"

      # Linear extrapolation of the last hour of free bytes on "/" four hours
      # (4*3600 seconds) ahead; fires when the prediction drops below zero.
      - alert: DiskWillFillIn4Hours(线性回归预测磁盘空间将耗尽)
        expr: 'predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 4*3600) < 0'
        for: 5m
        labels:
          name: Disk
          severity: critical
        annotations:
          summary: "基于最后一小时的增长历史记录,根文件系统的磁盘空间将在接下来的四小时内耗尽"
          description: "{{$labels.instance}}挂载在 {{ $labels.mountpoint }}的设备 {{$labels.device}}磁盘空间可能将用完"

      # 100 minus the per-instance average IO-time rate (in percent) below 40,
      # i.e. average disk busy time above 60%.
      # NOTE(review): irate() is combined with a `for` clause here; consider
      # rate() if short dips should not reset the pending timer.
      - alert: DiskIO(磁盘IO操作耗时)
        expr: '100-(avg(irate(node_disk_io_time_seconds_total{job="node_prod"}[1m])) by(instance)* 100) < 40'
        for: 5m
        labels:
          name: Disk
          severity: critical
        annotations:
          summary: "{{$labels.instance}}磁盘IO使用率过高,磁盘IO大于60%"
          description: "{{$labels.instance}}:磁盘IO空闲百分比为:{{humanize $value}}%"

      # A single target of job node_prod reports up == 0.
      - alert: InstanceDown(实例已停止响应抓取)
        expr: 'up{job="node_prod"} == 0'
        for: 10s
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} is down!"
          description: "实例 {{ $labels.instance }}已停止响应抓取"

      # Average of `up` across the job is below 0.75.
      - alert: InstancesDown(作业中至少25%的实例无法响应抓取)
        expr: 'avg(up{job="node_prod"}) by (job) < 0.75'
        for: 10s
        labels:
          severity: critical
        annotations:
          summary: "作业中25%以上的实例停止响应抓取"
          description: "作业 {{$labels.job}}中至少25%的实例无法响应抓取"

      # No `up` series exists at all for job node_prod.
      # absent() returns a series labelled only from the equality matchers in
      # its selector (here: job), so there is no `instance` label to print;
      # the description therefore reports the job instead.
      - alert: InstancesGone(UP指标缺失警报)
        expr: 'absent(up{job="node_prod"})'
        for: 10s
        labels:
          severity: critical
        annotations:
          summary: "节点作业中的UP指标消失"
          description: "作业 {{ $labels.job }} 的UP指标消失"
---
# Alerting rules for NIC throughput (group `network_alerts`).
# Both alerts fire when a non-virtual interface moves more than 20 MiB/s
# (bytes / 1024 / 1024) for one minute. The device regex excludes loopback
# and the listed virtual/bridge/bond interface name patterns.
# rate() is used so the value really is the 5-minute average that the
# descriptions announce; irate() only looks at the last two samples and
# brief dips can reset the `for` timer.
# NOTE(review): the top-level `groups:` key is repeated once per group in this
# file, so each group looks like the body of a separate rule file. It is kept
# here as its own YAML document - confirm how the files are split before
# loading them into Prometheus.
# NOTE(review): the two alert names differ in capitalisation
# (HostNetwork_receive vs hostNetwork_transmit); left unchanged because
# Alertmanager routes or silences may match on them.
groups:
  - name: network_alerts
    rules:
      # Inbound traffic per interface.
      - alert: HostNetwork_receive(网卡接收流量异常)
        expr: >-
          rate(node_network_receive_bytes_total{job="node_prod",device!~"lo|qb.*|qv.*|tap.*|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m])
          / 1024 / 1024 > 20
        for: 1m
        labels:
          name: Network_receive
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 网卡接收流量异常"
          description: "{{$labels.instance}} 网卡{{$labels.device}} 5分钟平均接收流量为 {{ humanize $value }}MB/s"

      # Outbound traffic per interface; the description names outbound
      # traffic to match the alert's summary.
      - alert: hostNetwork_transmit(网卡流出流量异常)
        expr: >-
          rate(node_network_transmit_bytes_total{job="node_prod",device!~"lo|qb.*|qv.*|tap.*|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m])
          / 1024 / 1024 > 20
        for: 1m
        labels:
          name: Network_transmit
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 网卡流出流量异常"
          description: "{{$labels.instance}} 网卡{{$labels.device}} 5分钟平均流出流量为 {{ humanize $value }}MB/s"
---
# Alerting rule for TLS certificate expiry (group `SSL证书状态`).
# NOTE(review): the top-level `groups:` key is repeated once per group in this
# file, so each group looks like the body of a separate rule file. It is kept
# here as its own YAML document - confirm how the files are split before
# loading them into Prometheus.
groups:
  - name: SSL证书状态
    rules:
      # (expiry timestamp - now) / 86400 gives the remaining lifetime in days;
      # fires when fewer than 3 days remain for at least one hour.
      - alert: "SSL证书过期警告"
        expr: (probe_ssl_earliest_cert_expiry - time())/86400 <3
        for: 1h
        labels:
          # Uses the same severity vocabulary (warning/critical) as the other
          # rule groups so severity-based routing treats this alert alike.
          severity: warning
        annotations:
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
          summary: "SSL证书过期警告"
---
# Alerting rule for failed probes (group `blackbox_network_stats`).
# NOTE(review): the top-level `groups:` key is repeated once per group in this
# file, so each group looks like the body of a separate rule file. It is kept
# here as its own YAML document - confirm how the files are split before
# loading them into Prometheus.
groups:
  - name: blackbox_network_stats
    rules:
      # Fires when probe_success has been 0 for three minutes.
      - alert: blackbox_network_stats
        expr: 'probe_success == 0'
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: '接口/主机/端口 {{ $labels.instance }}  无法联通'
          description: '请尽快检测'
---
# Alerting rules about the Prometheus server itself (group
# `prometheus_alerts`).
# NOTE(review): the top-level `groups:` key is repeated once per group in this
# file, so each group looks like the body of a separate rule file. It is kept
# here as its own YAML document - confirm how the files are split before
# loading them into Prometheus.
groups:
  - name: prometheus_alerts
    rules:
      # The last configuration reload is reported as unsuccessful (== 0)
      # for ten minutes.
      - alert: PrometheusConfigReloadFailed(Prometheus配置重载失败)
        expr: 'prometheus_config_last_reload_successful == 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Reloading Prometheus' configuration has failed on {{ $labels.instance }}."

      # Fewer than one Alertmanager has been discovered for ten minutes.
      - alert: PrometheusNotConnectedToAlertmanagers(Prometheus没有发现任何Alertmanager)
        expr: 'prometheus_notifications_alertmanagers_discovered < 1'
        for: 10m
        labels:
          severity: warning
        annotations:
          description: "Prometheus {{ $labels.instance }} is not connected to any Alertmanagers"
©著作权归作者所有:来自51CTO博客作者辉晖飛的原创作品,如需转载,请注明出处,否则将追究法律责任

更多相关文章

  1. Linux基础-18day-Linux系统磁盘管理(du/df/mount命令)
  2. Linux运维教程-Linux系统磁盘管理
  3. 磁盘空间-逻辑卷
  4. 划分分区组成逻辑卷和扩展根分区
  5. 2021-2-23:Java 文件映射内存是如何更新到硬盘文件的,什么情况下会
  6. 操作系统——计算机硬件简介
  7. 2.33 在windows 2012中创建iSCSI虚拟磁盘和iSCSI目标
  8. 只有1个字节的文件实际占用多少磁盘空间
  9. 新建一个空文件占用多少磁盘空间?

随机推荐

  1. android EditText TextView的属性整理 最
  2. Android平板开发注意点
  3. android RelativeLayout属性和布局实例
  4. android surfaceView 概述
  5. Android实现3个圆圈的动画
  6. Android简易注解View(java反射实现)
  7. android解析json小例子
  8. AndroidManifest.xml之 element详解
  9. android讲义9之向电话本进行批处理的插入
  10. cocos2d-x环境搭建 for eclipse