# Host-level monitors built on node_exporter metrics.
#
# Each list item is one monitor: a PromQL `query`, the label used as the
# monitored object (`object_tag`), the labels the result is grouped by
# (`group_by`) and one or more threshold `rules`.
# NOTE(review): presumably `reverse: "true"` means "trigger when the value is
# BELOW the threshold" and `reverse: "false"` means "trigger when ABOVE" - this
# matches how the free-space/uptime monitors use it, but confirm against the
# rule-engine documentation.
# `labels_required` / `labels_absent` are left empty (parsed as null) in every
# rule, exactly as in the original configuration.

#
# Uptime
#
# The query yields the difference between the node's current time and its boot
# time, in seconds.
- id: "41x0300aaaaa9982"
  name: "[Host] Непредвиденная перезагрузка хоста {{.Host}}"
  description: "Непредвиденная перезагрузка хоста {{.Host}}"
  query: "node_time_seconds - node_boot_time_seconds"
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "hostname"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "No data"
  rules:
    - id: "42x0300aaaaa9982"
      labels_required:
      labels_absent:
      type: "threshold_1level_sym"
      parameters:
        level: "600"
        critical: "true"
        reverse: "true"
      priority: 1

#
# CPU Utilization
#
# Total CPU usage: 100 minus the average idle percentage.
- id: "41x0300aaaaa9991"
  name: "[Host] Высокая нагрузка CPU хоста {{.Host}} ({{.Value0}}%)"
  description: "Высокая нагрузка CPU хоста {{.Host}} ({{.Value0}}%)"
  query: '100 - avg by (hostname, instance, job, group) (irate(node_cpu_seconds_total{mode="idle"}[5m]) * 100)'
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "hostname"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "No data"
  rules:
    - id: "42x0300aaaaa9991"
      labels_required:
      labels_absent:
      type: "threshold_2levels_sym"
      parameters:
        warning: "60"
        critical: "85"
        reverse: "false"
      priority: 1

# CPU time spent in kernel ("system") mode, as an average percentage.
- id: "41x0300aaaaa9992"
  name: "[Host] Высокая загрузка CPU ядром на хосте {{.Host}} ({{.Value2}}%)"
  description: "Высокая загрузка CPU ядром на хосте {{.Host}} ({{.Value2}}%)"
  query: 'avg by (hostname, instance, job, group) (irate(node_cpu_seconds_total{mode="system"}[5m]) * 100)'
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "hostname"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "No data"
  rules:
    - id: "42x0300aaaaa9992"
      labels_required:
      labels_absent:
      type: "threshold_1level_sym"
      parameters:
        level: "50"
        critical: "true"
        reverse: "false"
      priority: 1

# Average iowait percentage; the join with node_uname_info adds the `nodename`
# label to the result.
- id: "41x0300aaaaa9993"
  name: "[Host] Высокий IOWAIT на хосте {{.Host}} ({{.Value2}}%)"
  description: "Высокий IOWAIT на хосте {{.Host}} ({{.Value2}}%)"
  query: '(avg by (instance, hostname, job, group) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100) * on(instance, hostname, job, group) group_left (nodename) node_uname_info{nodename=~".+"}'
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "hostname"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "No data"
  rules:
    - id: "42x0300aaaaa9993"
      labels_required:
      labels_absent:
      type: "threshold_2levels_sym"
      parameters:
        warning: "10"
        critical: "30"
        reverse: "false"
      priority: 1

#
# Memory utilization
#
# Used RAM percentage, derived from MemAvailable / MemTotal.
- id: "41x0300aaaaa9994"
  name: "[Host] Высокая утилизация RAM на хосте {{.Host}} ({{.Value2}}%)"
  description: "Высокая утилизация RAM на хосте {{.Host}} ({{.Value2}}%)"
  query: '(100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100))'
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "hostname"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "No data"
  rules:
    - id: "42x0300aaaaa9994"
      labels_required:
      labels_absent:
      type: "threshold_2levels_sym"
      parameters:
        warning: "80"
        critical: "95"
        reverse: "false"
      priority: 1

#
# Filesystem
#
# Free space percentage per mountpoint on writable filesystems.
- id: "41x0300aaaaa9995"
  name: '[Host] Мало места на диске {{index .Labels "mountpoint"}} на хосте {{.Host}} свободно {{.Value0}}%'
  description: 'Мало места на диске {{index .Labels "mountpoint"}} на хосте {{.Host}} свободно {{.Value0}}%'
  # This variant of the query also reports files that are mounted from the host
  # machine: they are counted as filesystems and their presence clutters the output.
  # query: '((node_filesystem_avail_bytes{device!~"rootfs"} * 100) / node_filesystem_size_bytes{device!~"rootfs"} and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance, hostname, job, group) group_left (nodename) node_uname_info'
  # This variant filters out part of the filesystems mounted from the host machine.
  query: '((node_filesystem_avail_bytes{device!~"rootfs|tmpfs"} * 100) / node_filesystem_size_bytes{device!~"rootfs|tmpfs"} and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance, hostname, job, group) group_left (nodename) node_uname_info'
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "hostname"
    - "mountpoint"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "No data"
  rules:
    - id: "42x0300aaaaa9995"
      labels_required:
      labels_absent:
      type: "threshold_2levels_sym"
      parameters:
        # Thresholds are on FREE space, hence warning > critical.
        warning: "20"
        critical: "10"
        reverse: "true"
      priority: 1

# Free inode percentage per mountpoint on writable filesystems.
- id: "41x0300aaaaa9996"
  name: '[Host] Мало свободных inodes на хосте {{.Host}} ({{.Value2}}%)'
  description: 'Мало свободных inodes на хосте {{.Host}} ({{.Value2}}%)'
  query: '(node_filesystem_files_free{fstype!="msdosfs",fstype!="tmpfs",mountpoint!~"/etc/hostname",mountpoint!~"/etc/resolv.conf",mountpoint!~"/etc/hosts"} / node_filesystem_files{fstype!="msdosfs",fstype!="tmpfs",mountpoint!~"/etc/hostname",mountpoint!~"/etc/resolv.conf",mountpoint!~"/etc/hosts"} * 100 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance, hostname, job, group) group_left (nodename) node_uname_info{nodename=~".+"}'
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "hostname"
    - "mountpoint"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "No data"
  rules:
    - id: "42x0300aaaaa9996"
      labels_required:
      labels_absent:
      type: "threshold_1level_sym"
      parameters:
        level: "10"
        critical: "true"
        reverse: "true"
      priority: 1

# Filesystem device errors, excluding the fstype values matched by the regex.
- id: "41x0300aaaaa9997"
  name: '[Host] Ошибка устройства файловой системы хоста {{.Host}} на устройстве {{index .Labels "device"}}'
  description: 'Ошибка устройства файловой системы хоста {{.Host}} на устройстве {{index .Labels "device"}}'
  query: 'node_filesystem_device_error{fstype!~"fuse.xrdp-chansrv|parsecfs|tmpfs|fuse.*"}'
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "hostname"
    - "mountpoint"
    # "device" is listed because name/description render
    # {{index .Labels "device"}}; the other monitors in this file list every
    # label their templates reference in group_by.
    # NOTE(review): confirm that templates only see group_by labels.
    - "device"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "No data"
  rules:
    - id: "42x0300aaaaa9997"
      labels_required:
      labels_absent:
      type: "threshold_1level_sym"
      parameters:
        level: "1"
        critical: "true"
        reverse: "false"
      priority: 1

#
# Network
#
# Because of how the expression is built, this monitor does not return metric
# groups in the OK state, only in the critical state. When a metric goes to
# NO_DATA the problem is closed, because the network interface has left
# operstate="down", i.e. it has come back up.
# NOTE(review): the query as written also yields 0 for interfaces with
# operstate="up" (first operand of `or`), and the rule below sets
# critical: "false" - the statement above may be outdated; confirm.
- id: "41x0300aaaaa9998"
  name: '[Host] Отключился сетевой интерфейс {{index .Labels "device"}} на хосте {{.Host}}'
  description: 'Отключился сетевой интерфейс {{index .Labels "device"}} на хосте {{.Host}}'
  # Single-quoted on purpose: the doubled backslashes must reach PromQL
  # unchanged, where the double-quoted regex literal unescapes them.
  query: 'sum by(hostname, device) ( node_network_info{device!~"^(eno[0-9]+|docker[0-9]*|veth[0-9a-f]+|br-[0-9a-f]+|lo|tun[0-9]*|cali[0-9a-f]+|flannel\\.\\d+)$", operstate="up"} * 0) or sum by(hostname, device) (node_network_info{device!~"^(eno[0-9]+|docker[0-9]*|veth[0-9a-f]+|br-[0-9a-f]+|lo|tun[0-9]*|cali[0-9a-f]+|flannel\\.\\d+)$", operstate="down"} * 1)'
  labels:
    integration: general
  object_tag: "hostname"
  aggregator: sum
  group_by:
    - "device"
    - "hostname"
  step: 5m
  rate: 60s
  metric_ttl: 30m
  no_data_mode: "close problem"
  rules:
    - id: "42x0300aaaaa9998"
      labels_required:
      labels_absent:
      type: "threshold_1level_sym"
      parameters:
        # The query produces 0 (up) or 1 (down); 0.5 separates the two.
        level: "0.5"
        critical: "false"
        reverse: "false"
      priority: 1