Перейти к содержанию

Prometheus

Установка Prometheus server и Node exporter#

Prometheus server install#

  • wget https://github.com/prometheus/prometheus/releases/download/v2.55.0-rc.0/prometheus-2.55.0-rc.0.linux-amd64.tar.gz - скачиваем прометеус сервер
  • tar xvfz *.tar.gz
  • cd prometheus-2.55.0-rc.0.linux-amd64.tar.gz
  • sudo mv prometheus /usr/local/bin/
  • sudo mv promtool /usr/local/bin
  • sudo mkdir /etc/prometheus/
  • sudo mkdir /etc/prometheus/data
  • sudo mv prometheus.yml /etc/prometheus/
# prometheus.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: "archers-paradox-servers"
    static_configs:
      - targets:
          - 89.22.241.241:9100
  • useradd -rs /bin/false prometheus - создаём системного пользователя для работы с prometheus
  • chown prometheus:prometheus /usr/local/bin/prometheus
  • chown -R prometheus:prometheus /etc/prometheus
  • vi /etc/systemd/system/prometheus.service - создаём systemd юнит
[Unit]
Description=Prometheus Server
After=network.target

[Service]
User=prometheus
Group=prometheus
Type=simple
Restart=on-failure
RestartSec=4s
ExecStart=/usr/local/bin/prometheus \
  --config.file       /etc/prometheus/prometheus.yml \
  --storage.tsdb.path /etc/prometheus/data

[Install]
WantedBy=multi-user.target
  • sudo systemctl daemon-reload - обновить systemd
  • sudo systemctl start prometheus
  • sudo systemctl enable prometheus

Настройка TLS#

  • touch web-config.yml - создать конфигурационный файл для настройки TLS

  • Добавить в Unit-файл путь к конфигу TLS в директиве ExecStart

    ExecStart=/usr/local/bin/prometheus \
      --config.file       /etc/prometheus/prometheus.yml \
      --storage.tsdb.path /etc/prometheus/data \
      --web.config.file /etc/prometheus/web-config.yml
    

  • certbot certonly --nginx -d prometheus.mcarov.pro - сгенерировать сертификаты, команду не хочу пояснять

Если на сервере нет Nginx

certbot certonly --standalone -d prometheus.mcarov.pro

  • Добавить сертификат и ключ в web-config.yml

    tls_server_config:
      cert_file: /home/prometheus/certs/example.com/example.com.crt
      key_file: /home/prometheus/certs/example.com/example.com.key
    

  • Добавить строку в /etc/crontab для обновления сертификата

    0 0,12 * * * root /opt/certbot/bin/python -c 'import random; import time; time.sleep(random.random() * 3600)' && sudo certbot renew -q
    

  • Создать скрипт /etc/letsencrypt/renewal-hooks/deploy/set-permissions-and-restart-prometheus.sh

    #!/bin/bash
    # Выдать права пользователю prometheus
    setfacl -R -m u:prometheus:rX /etc/letsencrypt/
    
    # Перезагрузить Prometheus
    systemctl restart prometheus
    

  • chmod +x /etc/letsencrypt/renewal-hooks/deploy/set-permissions-and-restart-prometheus.sh

  • certbot renew --dry-run - для отладки процесса обновления сертификатов

  • Создать конфиг nginx в /etc/nginx/sites-available

    server {
        server_name prometheus.mcarov.pro;
    
        location / {
            client_max_body_size 512M;
            proxy_pass https://127.0.0.1:9090;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }
    
        access_log /var/log/nginx/prometheus.mcarov.pro.access.log;
        error_log /var/log/nginx/prometheus.mcarov.pro.error.log;
    
        listen 443 ssl;
        ssl_certificate /etc/letsencrypt/live/prometheus.mcarov.pro/fullchain.pem;
        ssl_certificate_key /etc/letsencrypt/live/prometheus.mcarov.pro/privkey.pem;
        include /etc/letsencrypt/options-ssl-nginx.conf;
        ssl_dhparam /etc/letsencrypt/ssl-dhparams.pem;
    
    }
    
    server {
        if ($host = prometheus.mcarov.pro) {
            return 301 https://$host$request_uri;
        }
    
    
        listen 80;
        server_name prometheus.mcarov.pro;
        return 404;
    
    }
    

  • ln -s /etc/nginx/sites-available/prometheus.mcarov.pro /etc/nginx/sites-enabled/
  • nginx -t
  • systemctl reload nginx

Аутентификация для Prometheus Server#

  • htpasswd -nBC 10 "" | tr -d ':\n' - сгенерировать bcrypt-хеш пароля (команда запросит пароль и выведет только хеш)

  • Добавить в web-config.yml

    basic_auth_users:
      # user: password (hash)
      admin: $2y$10$QzpQ2fO9TpU1Hm4VbB6AMO8ZsdoplfesfAmI8MFB402BVIu5gf.TK
    

  • systemctl restart prometheus

  • В UI Grafana настроить Basic auth в Data Source

Node exporter install#

  • wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-arm64.tar.gz - скачиваем node exporter на удалённый хост

  • tar xvfz *.tar.gz

  • sudo mv node_exporter /usr/local/bin/

  • sudo useradd -rs /bin/false node_exporter

  • sudo chown node_exporter:node_exporter /usr/local/bin/node_exporter
  • sudo vi /etc/systemd/system/node_exporter.service
# node_exporter.service
[Unit]
Description=Prometheus Node Exporter
After=network.target 

[Service]
User=node_exporter
Group=node_exporter
Type=simple
Restart=on-failure
RestartSec=4s
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=multi-user.target
  • sudo systemctl daemon-reload
  • sudo systemctl start node_exporter
  • sudo systemctl enable node_exporter

Настройка TLS#

  • touch web.yml
  • Создать openssl.cnf и наполнить содержимым:

    [req]
    default_bits = 2048
    prompt = no
    default_md = sha256
    distinguished_name = dn
    
    [dn]
    CN = 150.241.66.94
    
    [v3_ext]
    authorityKeyIdentifier=keyid,issuer:always
    basicConstraints=CA:FALSE
    keyUsage=digitalSignature,keyEncipherment
    extendedKeyUsage=serverAuth
    subjectAltName=@alt_names
    
    [alt_names]
    IP.1 = 150.241.66.94
    

  • sudo openssl req -x509 -newkey rsa:2048 -keyout node_exporter.key -out node_exporter.pem -days 36500 -nodes -config openssl.cnf -extensions v3_ext - генерируем сертификаты

  • Не забываем выдать пользователю node_exporter права на чтение сертификата и ключа

  • Добавить в web.yml

    tls_server_config:
      cert_file: /etc/prometheus/node_exporter/certs/node_exporter.pem
      key_file: /etc/prometheus/node_exporter/certs/node_exporter.key
    

  • Добавить в Unit Node Exporter в директиву ExecStart

    --web.config.file=/etc/prometheus/node_exporter/web.yml
    

  • systemctl daemon-reload

  • systemctl restart node_exporter.service

  • На машине с Prometheus Server добавить в prometheus.yml в нужную джобу экспортера

      scheme: https
      tls_config:
        ca_file: /etc/prometheus/node_exporter/srv-infra/certs/node_exporter.pem
    

  • systemctl restart prometheus

Про PromQL#

  • increase() возвращает общий прирост за указанный интервал времени.
    increase(<метрика>[<интервал>])
    
  • rate() вычисляет среднюю скорость изменения счётчика за этот же интервал времени и в результате возвращает прирост в секунду.
  • increase(http_requests_total[5m]) — покажет, сколько запросов пришло за последние 5 минут.
  • rate(http_requests_total[5m]) — покажет среднее количество запросов в секунду за последние 5 минут.

Полезности#

  • promtool check config prometheus.yml - проверка синтаксиса
  • promtool check rules rules.yml - проверка синтаксиса
  • promtool test rules rules_test.yml

Telegraf#

  • telegraf --test --config /etc/telegraf/telegraf.conf --input-filter tail - запустить телеграф в режиме отладки

PostgreSQL Exporter#

  • Создать пользователя для мониторинга

    CREATE USER postgres_exporter WITH PASSWORD 'password';
    ALTER USER postgres_exporter SET SEARCH_PATH TO pg_catalog;
    GRANT CONNECT ON DATABASE postgres TO postgres_exporter;
    GRANT USAGE ON SCHEMA pg_catalog  TO postgres_exporter;
    GRANT EXECUTE ON FUNCTION pg_ls_waldir TO postgres_exporter;
    GRANT pg_read_all_stats TO postgres_exporter;
    

  • Установить Exporter

    wget https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-amd64.tar.gz
    tar -xzvf postgres_exporter-0.17.1.linux-amd64.tar.gz
    rm postgres_exporter-0.17.1.linux-amd64.tar.gz
    cd postgres_exporter-0.17.1.linux-amd64/
    mv postgres_exporter /usr/local/bin/
    cd ..
    rm -rf postgres_exporter-0.17.1.linux-amd64/
    chmod +x /usr/local/bin/postgres_exporter
    /usr/local/bin/postgres_exporter --version
    

  • Добавить в postgresql.conf

    ssl = on
    ssl_cert_file = '/etc/postgresql/ssl/server.crt'
    ssl_key_file = '/etc/postgresql/ssl/server.key'
    ssl_ca_file = '/etc/postgresql/ssl/root.crt'
    

  • Сгенерировать сертификаты, включая корневой (самоподписанные)

    mkdir -p /etc/postgresql/ssl
    chown postgres:postgres /etc/postgresql/ssl
    chmod 700 /etc/postgresql/ssl
    
    openssl genrsa -out /etc/postgresql/ssl/root.key 4096
    openssl req -x509 -new -nodes -key /etc/postgresql/ssl/root.key -sha256 -days 3650 \
      -out /etc/postgresql/ssl/root.crt \
      -subj "/CN=PostgreSQL Root CA"
    chmod 600 /etc/postgresql/ssl/root.key
    
    
    openssl genrsa -out /etc/postgresql/ssl/server.key 2048
    openssl req -new -key /etc/postgresql/ssl/server.key -out /etc/postgresql/ssl/server.csr \
      -subj "/CN=$(hostname)"
    openssl x509 -req -in /etc/postgresql/ssl/server.csr -CA /etc/postgresql/ssl/root.crt \
      -CAkey /etc/postgresql/ssl/root.key -CAcreateserial -out /etc/postgresql/ssl/server.crt \
      -days 3650 -sha256
    chmod 600 /etc/postgresql/ssl/server.key
    chown postgres:postgres /etc/postgresql/ssl/server.*
    
    systemctl restart postgresql
    

  • Ограничить в pg_hba.conf доступ для пользователя postgres_exporter

    hostssl  postgres  postgres_exporter  127.0.0.1/32  scram-sha-256
    

  • Конфигурируем клиент

    useradd -r -s /bin/false postgres_exporter
    chown -R postgres_exporter:postgres_exporter /etc/postgres_exporter 
    chmod 600 /etc/postgres_exporter/*
    
    mkdir -p /etc/postgres_exporter
    cp /etc/postgresql/ssl/root.crt /etc/postgres_exporter/
    chmod 700 /etc/postgres_exporter
    chmod 400 /etc/postgres_exporter/root.crt
    

  • Конфигурируем systemd

    # Файл /etc/postgres_exporter/env
    DATA_SOURCE_NAME=postgresql://postgres_exporter:password@127.0.0.1:5432/postgres?sslmode=verify-ca&sslrootcert=/etc/postgres_exporter/root.crt
    
    # Файл /etc/systemd/system/postgres_exporter.service
    [Unit]
    Description=PostgreSQL Exporter for Prometheus
    After=network.target
    
    [Service]
    User=postgres_exporter
    Group=postgres_exporter
    EnvironmentFile=/etc/postgres_exporter/env
    ExecStart=/usr/local/bin/postgres_exporter
    
    [Install]
    WantedBy=multi-user.target
    

  • Добавить в prometheus.yml

    - job_name: 'postgres_exporter'
      static_configs:
        - targets: ['localhost:9187']
    

systemctl restart prometheus

MariaDB Exporter#

На сервере MariaDB#

  • Создать файл openssl.cnf

    [ req ]
    prompt             = no
    distinguished_name = req_distinguished_name
    req_extensions     = v3_req
    
    [ req_distinguished_name ]
    CN = 89.22.228.13
    
    [ v3_req ]
    keyUsage         = keyEncipherment, dataEncipherment
    extendedKeyUsage = serverAuth
    subjectAltName   = @alt_names
    
    [ alt_names ]
    DNS.1 = mariadb.mcarov.pro
    IP.1  = 89.22.228.13
    

  • Создаём сертификаты

    mkdir -p /etc/mysql/ssl
    
    openssl req -x509 -new -nodes -days 3650 \
      -subj "/CN=MyMariaDB-CA" \
      -keyout /etc/mysql/ssl/ca-key.pem \
      -out /etc/mysql/ssl/ca.pem \
      -sha256 -days 3650
    
    openssl req -new -nodes -newkey rsa:2048 \
      -keyout /etc/mysql/ssl/server-key.pem \
      -out /etc/mysql/ssl/server.csr \
      -config openssl.cnf
    
    openssl x509 -req -in /etc/mysql/ssl/server.csr \
      -CA /etc/mysql/ssl/ca.pem -CAkey /etc/mysql/ssl/ca-key.pem \
      -CAcreateserial \
      -out /etc/mysql/ssl/server-cert.pem \
      -days 3650 -sha256 \
      -extensions v3_req -extfile openssl.cnf
    

  • В конфиге MariaDB

    [mysqld]
    ssl-ca=/etc/mysql/ssl/ca.pem
    ssl-cert=/etc/mysql/ssl/server-cert.pem
    ssl-key=/etc/mysql/ssl/server-key.pem
    
    sudo systemctl restart mariadb
    

  • Создать в базе пользователя экспортера

    CREATE USER 'mariadb_exporter'@'192.109.139.92' IDENTIFIED BY 'mariadb_exporter';
    GRANT SELECT, PROCESS, REPLICATION CLIENT, RELOAD ON *.* TO 'mariadb_exporter'@'192.109.139.92' IDENTIFIED BY 'mariadb_exporter';
    FLUSH PRIVILEGES;
    

На сервере Exporter#

  • Ставим экспортер на ВМ, где установлен Prometheus Server

    wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.17.2/mysqld_exporter-0.17.2.linux-amd64.tar.gz
    tar -xzvf mysqld_exporter-0.17.2.linux-amd64.tar.gz
    rm mysqld_exporter-0.17.2.linux-amd64.tar.gz
    mv mysqld_exporter-0.17.2.linux-amd64/mysqld_exporter /usr/local/bin
      chmod +x /usr/local/bin/mysqld_exporter
    

  • Создаём пользователя для экспортера

    useradd -r -s /usr/sbin/nologin mariadb_exporter
    mkdir -p /etc/mariadb_exporter
    chown -R mariadb_exporter:mariadb_exporter /etc/mariadb_exporter
    

  • Создать /etc/mariadb_exporter/.my.cnf

    [client]
    user=mariadb_exporter
    password=mariadb_exporter
    host=89.22.228.14
    ssl-ca=/etc/mariadb_exporter/ca.pem
    ssl-verify-server-cert
    
    chown -R mariadb_exporter:mariadb_exporter /etc/mariadb_exporter
    

  • Создаём Unit-файл

    [Unit]
    Description=Prometheus MariaDB Exporter
    After=network.target
    
    [Service]
    User=mariadb_exporter
    Group=mariadb_exporter
    ExecStart=/usr/local/bin/mysqld_exporter \
      --config.my-cnf=/etc/mariadb_exporter/.my.cnf
    Restart=on-failure
    
    [Install]
    WantedBy=multi-user.target
    
    systemctl daemon-reload
    systemctl enable --now mariadb_exporter
    

  • Добавляем в prometheus.yml

    - job_name: 'mariadb_exporter'
      static_configs:
        - targets: ['localhost:9104']
    

  • Проверка

    curl http://localhost:9104/metrics
    

Alerts#

Устанавливаем Alertmanager#

useradd -r -s /usr/sbin/nologin alertmanager

wget https://github.com/prometheus/alertmanager/releases/download/v0.28.1/alertmanager-0.28.1.linux-amd64.tar.gz
tar -xzvf alertmanager-*.tar.gz
mv alertmanager-*/alertmanager /usr/local/bin/
mv alertmanager-*/amtool /usr/local/bin/
chown alertmanager:alertmanager /usr/local/bin/alertmanager 
chown alertmanager:alertmanager /usr/local/bin/amtool
rm -rf alertmanager-*
  • Подготовка

    mkdir -p /etc/alertmanager
    mkdir -p /var/lib/alertmanager
    chown -R alertmanager:alertmanager /etc/alertmanager /var/lib/alertmanager
    

  • Файл /etc/alertmanager/alertmanager.yml

    global:
      resolve_timeout: 5m
    
    route:
      group_by: ['alertname']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 1h
      receiver: 'default-receiver'
    
    receivers:
    - name: 'default-receiver'
      webhook_configs:
      - url: 'http://localhost:9093/-/healthy'  # временный URL для теста
        send_resolved: true
    

  • Файл /etc/systemd/system/alertmanager.service

    [Unit]
    Description=Alertmanager
    Wants=network-online.target
    After=network-online.target
    
    [Service]
    User=alertmanager
    Group=alertmanager
    Type=simple
    ExecStart=/usr/local/bin/alertmanager \
      --config.file=/etc/alertmanager/alertmanager.yml \
      --storage.path=/var/lib/alertmanager
    Restart=always
    
    [Install]
    WantedBy=multi-user.target
    
    systemctl daemon-reload
    systemctl enable --now alertmanager
    systemctl status alertmanager  
    

HTTPS Alertmanager#

  • vi /etc/nginx/sites-available/alertmanager.mcarov.pro

    server {
        listen 80;
        server_name alertmanager.mcarov.pro;
    
        location / {
            proxy_pass http://127.0.0.1:9093;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }
    }
    
    ln -s /etc/nginx/sites-available/alertmanager.mcarov.pro /etc/nginx/sites-enabled/
    nginx -t
    systemctl reload nginx
    
    certbot --nginx -d alertmanager.mcarov.pro
    

  • Аутентификация

    htpasswd -c /etc/nginx/alertmanager/.htpasswd admin
    
    # Добавить в конфиг nginx
    auth_basic "Alertmanager";
    auth_basic_user_file /etc/nginx/alertmanager/.htpasswd;
    
    nginx -t
    systemctl reload nginx
    

Настройка алертов#

  • Добавить Alertmanager в prometheus.yml

    alerting:
      alertmanagers:
        - static_configs:
            - targets: ['localhost:9093']
    
    systemctl restart prometheus
    

  • Проверка

    # Должен вернуть ОК
    curl http://localhost:9093/-/healthy
    amtool check-config /etc/alertmanager/alertmanager.yml
    

  • Создай бота в Telegram, создай чат и добавь бота в этот чат

  • Получи chat-id

В чате должно быть минимум одно сообщение

curl "https://api.telegram.org/bot<BOT_TOKEN>/getUpdates" | jq

  • /etc/alertmanager/templates/telegram.tmpl

    {{ define "telegram.critical.message" }}
    {{ if eq .Status "firing" }}
    🔥 *[CRITICAL ALERT]* {{ .CommonLabels.alertname }}
    📌 **Instance**: {{ .CommonLabels.instance }}
    🕒 **Firing since**: {{ (.Alerts.Firing | first).StartsAt.Format "2006-01-02 15:04:05" }}
    📝 **Summary**: {{ .CommonAnnotations.summary }}
    
    {{ .CommonAnnotations.description }}
    {{ if .CommonAnnotations.runbook }}🔗 **Runbook**: {{ .CommonAnnotations.runbook }}{{ end }}
    {{ else }}
    ✅ *[CRITICAL RESOLVED]* {{ .CommonLabels.alertname }}
    📌 **Instance**: {{ .CommonLabels.instance }}
    🕒 **Resolved at**: {{ (.Alerts.Resolved | first).EndsAt.Format "2006-01-02 15:04:05" }}
    📝 **Summary**: {{ .CommonAnnotations.summary }}
    {{ end }}
    {{ end }}
    
    {{ define "telegram.warning.message" }}
    {{ if eq .Status "firing" }}
    ⚠️ *[WARNING]* {{ .CommonLabels.alertname }}
    
    {{ .CommonAnnotations.summary }}
    **Details**: {{ .CommonAnnotations.description }}
    {{ else }}
    ✅ *[WARNING RESOLVED]* {{ .CommonLabels.alertname }}
    📝 {{ .CommonAnnotations.summary }}
    {{ end }}
    {{ end }}
    
    {{ define "telegram.db.message" }}
    {{ if eq .Status "firing" }}
    🛠 *[DB ALERT]* {{ .CommonLabels.alertname }} ({{ .CommonLabels.service }})
    
    {{ .CommonAnnotations.description }}
    **Action required**: {{ .CommonAnnotations.runbook }}
    {{ else }}
    ✅ *[DB ALERT RESOLVED]* {{ .CommonLabels.alertname }} ({{ .CommonLabels.service }})
    📝 {{ .CommonAnnotations.summary }}
    {{ end }}
    {{ end }}
    
    {{ define "telegram.default.message" }}
    {{ if eq .Status "firing" }}
    ℹ️ *[ALERT]* {{ .CommonLabels.alertname }}
    
    {{ .CommonAnnotations.summary }}
    {{ .CommonAnnotations.description }}
    {{ else }}
    ✅ *[ALERT RESOLVED]* {{ .CommonLabels.alertname }}
    📝 {{ .CommonAnnotations.summary }}
    {{ end }}
    {{ end }}
    

  • /etc/alertmanager/alertmanager.yml

    global:
      resolve_timeout: 5m
      http_config:
        follow_redirects: true
    
    templates:
      - '/etc/alertmanager/templates/*.tmpl'
    
    route:
      group_by: ['alertname', 'severity']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      receiver: 'telegram-default'
    
      routes:
        - match:
            severity: 'critical'
          receiver: 'telegram-critical'
          continue: false
          group_interval: 15m
          repeat_interval: 2h
    
        - match:
            severity: 'warning'
          receiver: 'telegram-warnings'
          group_interval: 1h
          repeat_interval: 12h
    
        - match_re:
            service: 'mysql|postgres|influx'
          receiver: 'telegram-db-team'
    
    inhibit_rules:
    - source_match:
        severity: 'critical'
      target_match:
        severity: 'warning'
      equal: ['alertname']
    
    receivers:
    - name: 'telegram-critical'
      telegram_configs:
      - bot_token: 'BOT_TOKEN'
        chat_id: CHAT_ID
        parse_mode: 'Markdown'
        message: '{{ template "telegram.critical.message" . }}'
        send_resolved: true
    
    - name: 'telegram-warnings'
      telegram_configs:
      - bot_token: 'BOT_TOKEN'
        chat_id: CHAT_ID
        parse_mode: 'Markdown'
        message: '{{ template "telegram.warning.message" . }}'
        send_resolved: true
    
    - name: 'telegram-db-team'
      telegram_configs:
      - bot_token: 'BOT_TOKEN'
        chat_id: CHAT_ID
        parse_mode: 'Markdown'
        message: '{{ template "telegram.db.message" . }}'
        send_resolved: true
    
    - name: 'telegram-default'
      telegram_configs:
      - bot_token: 'BOT_TOKEN'
        chat_id: CHAT_ID
        parse_mode: 'Markdown'
        message: '{{ template "telegram.default.message" . }}'
        send_resolved: true
    
    # проверка конфига
    amtool check-config /etc/alertmanager/alertmanager.yml
    # проверка шаблонов
    amtool check-config /etc/alertmanager/alertmanager.yml --template-files /etc/alertmanager/templates/*.tmpl
    
    systemctl restart prometheus alertmanager
    

  • /etc/prometheus/rules/*_rules.yml

    groups:
    - name: Infrastructure
      rules:
      # CPU
      - alert: HighCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100 > 80
        for: 10m
        labels:
          severity: warning
          category: infra
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}% for 10 minutes."
    
      # Memory
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100 > 85
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}% for 15 minutes."
    
      # Disk
      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes{mountpoint=~"/|/var", fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint=~"/|/var"} * 100) < 15
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.mountpoint }} ({{ $labels.instance }})"
          description: "Only {{ printf \"%.2f\" $value }}% space left on {{ $labels.mountpoint }}."
    
      # Network
      - alert: HighNetworkErrors
        expr: rate(node_network_transmit_errs_total[2m]) + rate(node_network_receive_errs_total[2m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network errors on {{ $labels.instance }}"
    
    - name: ServiceHealth
      rules:
      # Service Availability
      - alert: ServiceDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} down on {{ $labels.instance }}"
          description: "The service has been down for more than 3 minutes."
    
    - name: PrometheusMonitoring
      rules:
      # Prometheus self-monitoring
      - alert: PrometheusDown
        expr: up{job="prometheus"} == 0
        for: 5m
        labels:
          severity: critical
          category: monitoring
        annotations:
          summary: "Prometheus is unreachable"
          description: "Prometheus is up==0 for more than 5 minutes."
    
      # Exporter monitoring (Node Exporter)
      - alert: NodeExporterDown
        expr: up{job="node"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node Exporter down on {{ $labels.instance }}"
    
      # Alertmanager monitoring
      - alert: AlertmanagerDown
        expr: up{job="alertmanager"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is unreachable"
    
    # проверить
    promtool check rules /etc/prometheus/rules/*.yml
    

Blackbox Exporter#

  • Создаём пользователя

    useradd --no-create-home --shell /usr/sbin/nologin blackbox_exporter
    

  • Устанавливаем

    wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.26.0/blackbox_exporter-0.26.0.linux-amd64.tar.gz
    tar -xzvf blackbox_exporter-0.26.0.linux-amd64.tar.gz
    mv blackbox_exporter-0.26.0.linux-amd64/blackbox_exporter /usr/local/bin/
    chown blackbox_exporter:blackbox_exporter /usr/local/bin/blackbox_exporter
    rm -rf blackbox_exporter*
    

  • Структура конфига Blackbox exporter

    modules:
      <module_name>:
        prober: <type>
        <type_specific_settings>
    

  • Конфигурируем экспортер

    mkdir /etc/blackbox_exporter
    

  • vi /etc/blackbox_exporter/blackbox.yml

    modules:
      http_2xx:
        prober: http
        timeout: 5s
        http:
          valid_status_codes:
            - 200
            - 201
            - 202
            - 203
            - 204
            - 205
            - 206
            - 207
            - 208
            - 226
    
      http_auth_2xx:
        prober: http
        timeout: 5s
        http:
          headers:
            # логин:пароль в base64 (echo -n login:pass | base64)
            Authorization: "Basic YWRtRTc0fakehashfdVnZldWE=" 
          valid_status_codes:
            - 200
            - 201
            - 202
            - 203
            - 204
            - 205
            - 206
            - 207
            - 208
            - 226
    
      tcp_connect:
        prober: tcp
        timeout: 5s
    
      postgres_tcp:
        prober: tcp
        tcp:
          tls: false
    
      mariadb_tcp:
        prober: tcp
        tcp:
          query_response:
            - expect: "^"
          tls: false
    
      vm_icmp:
        prober: icmp
        timeout: 3s
    

chown -R blackbox_exporter:blackbox_exporter /etc/blackbox_exporter
  • vi /etc/systemd/system/blackbox_exporter.service

    [Unit]
    Description=Prometheus Blackbox Exporter
    Wants=network-online.target
    After=network-online.target
    
    [Service]
    User=blackbox_exporter
    Group=blackbox_exporter
    AmbientCapabilities=CAP_NET_RAW
    CapabilityBoundingSet=CAP_NET_RAW
    Type=simple
    ExecStart=/usr/local/bin/blackbox_exporter \
      --config.file=/etc/blackbox_exporter/blackbox.yml \
      --web.listen-address="127.0.0.1:9115"
    Restart=on-failure
    
    [Install]
    WantedBy=multi-user.target
    

  • Запускаем экспортер

    systemctl daemon-reload
    systemctl enable --now blackbox_exporter
    systemctl status blackbox_exporter
    

  • Проверка

    curl http://127.0.0.1:9115/metrics
    

  • Настроим доступ к экспортеру по HTTPS: vi /etc/nginx/sites-available/blackbox.mcarov.pro

    server {
        server_name blackbox.mcarov.pro;
    
        location / {
            proxy_pass http://127.0.0.1:9115;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }
    
        access_log /var/log/nginx/blackbox.mcarov.pro.access.log;
        error_log /var/log/nginx/blackbox.mcarov.pro.error.log;
    }
    

ln -s /etc/nginx/sites-available/blackbox.mcarov.pro /etc/nginx/sites-enabled/
nginx -t
systemctl reload nginx
certbot --nginx -d blackbox.mcarov.pro
  • Добавить в /etc/crontab для обновления сертификатов

    0 0,12 * * * root /opt/certbot/bin/python -c 'import random; import time; time.sleep(random.random() * 3600)' && sudo certbot renew -q
    

  • Базовая аутентификация

    apt install apache2-utils  
    mkdir -p /etc/nginx/blackbox
    htpasswd -c /etc/nginx/blackbox/.htpasswd admin
    

# Добавить в конфиг nginx
auth_basic "Blackbox auth";
auth_basic_user_file /etc/nginx/blackbox/.htpasswd; 
nginx -t 
systemctl reload nginx
  • vi /etc/prometheus/prometheus.yml
      - job_name: 'blackbox-http'
        metrics_path: /probe
        scheme: https
        basic_auth:
          username: 'admin'
          password: 'password'
        tls_config:
          insecure_skip_verify: false
        params:
          module: [http_2xx]
        static_configs:
          - targets:
            - https://grafana.mcarov.pro
            - https://git.mcarov.pro
            - https://wiki.mcarov.pro
            - https://minio.mcarov.pro
            - https://influx.mcarov.pro/ping
        relabel_configs: &relabel
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox.mcarov.pro
    
      - job_name: 'blackbox-http-auth'
        metrics_path: /probe
        scheme: https
        basic_auth:
          username: 'admin'
          password: 'password'
        tls_config:
          insecure_skip_verify: false
        params:
          module: [http_auth_2xx]
        static_configs:
          - targets:
              - https://alertmanager.mcarov.pro
              - https://prometheus.mcarov.pro
              - https://blackbox.mcarov.pro
        relabel_configs: *relabel
    
      - job_name: 'postgres-check'
        metrics_path: /probe
        scheme: https
        basic_auth:
          username: 'admin'
          password: 'password'
        tls_config:
          insecure_skip_verify: false
        params:
          module: [postgres_tcp]
        static_configs:
          - targets:
            - 127.0.0.1:5432
        relabel_configs: *relabel
    
      - job_name: 'mariadb-check'
        metrics_path: /probe
        scheme: https
        basic_auth:
          username: 'admin'
          password: 'password'
        tls_config:
          insecure_skip_verify: false
        params:
          module: [mariadb_tcp]
        static_configs:
          - targets:
            - 89.22.28.13:3306
        relabel_configs: *relabel
    
      - job_name: 'vm-ping'
        metrics_path: /probe
        scheme: https
        basic_auth:
          username: 'admin'
          password: 'password'
        tls_config:
          insecure_skip_verify: false
        params:
          module: [vm_icmp]
        static_configs:
          - targets:
            - 192.10.139.92
            - 150.21.66.94
            - 89.2.228.13
        relabel_configs: *relabel
    

ура, Алерты для блэкбокс#

  • vi blackbox_alerts.yml
    groups:
    - name: blackbox_exporter_alerts
      rules:
      - alert: ServiceDown
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary:   "Сервис {{ $labels.instance }} недоступен"
          description: |
            Сервис {{ $labels.instance }} (job={{ $labels.job }})
            не отвечает уже более 2 минут.
    
      - alert: HighLatency
        expr: probe_success == 1 and avg_over_time(probe_duration_seconds[1m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary:   "Высокая задержка у {{ $labels.instance }}"
          description: |
            Сервис {{ $labels.instance }} (job={{ $labels.job }}) отвечает медленно:
            probe_duration_seconds={{ printf "%.3f" $value }}s (больше 1s)
            уже более 5 минут.
    
      - alert: BlackboxSslCertificateWillExpireSoon
        expr: 0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "TLS-сертификат истёкает для {{ $labels.instance }}"
          description: |
            Срок действия TLS-сертификата для {{ $labels.instance }} истёкает через 3 дня.
    
    
      - alert: TLSCertificateExpired
        expr: round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "TLS-сертификат истёк для {{ $labels.instance }}"
    
chown prometheus:prometheus /etc/prometheus/rules/blackbox_alerts.yml
  • Добавить в prometheus.yml

    rule_files:
      - 'rules/blackbox_alerts.yml'
    

  • promtool check config /etc/prometheus/prometheus.yml - проверка синтаксиса

  • promtool check rules /etc/prometheus/rules/blackbox_alerts.yml - проверка синтаксиса