Each Prometheus instance remote-writes to its Mimir instance. All Prometheus instances send alerts to the common Alertmanager.
Last active
December 13, 2024 15:15
-
-
Save theinhumaneme/ac5264620086b8c5722e13230ef5f808 to your computer and use it in GitHub Desktop.
NODE-MONITORING
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| networks: | |
| monitoring: | |
| external: true | |
| services: | |
| alertmanager: | |
| image: prom/alertmanager:v0.27.0 | |
| container_name: alertmanager | |
| restart: unless-stopped | |
| ports: | |
| - "9093:9093" | |
| volumes: | |
| - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml | |
| networks: | |
| - monitoring | |
| calert: | |
| container_name: calert | |
| image: ghcr.io/mr-karan/calert:latest | |
| restart: unless-stopped | |
| ports: | |
| - "6000:6000" | |
| volumes: | |
| - ./ca-alert-config.toml:/app/config.sample.toml | |
| - ./ca-alert-message.tmpl:/etc/calert/message.tmpl | |
| networks: | |
| - monitoring |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| global: | |
| resolve_timeout: 1m | |
| route: | |
| receiver: alerts | |
| group_by: ['alertname'] # Group alerts by the 'alertname' label | |
| group_wait: 0s # Send alerts immediately without waiting | |
| group_interval: 2m # How long to wait before notifying about new alerts added to a group that has already been notified | |
| repeat_interval: 2m # Repeat notifications every 2 minutes if the alert is still firing | |
| receivers: | |
| - name: 'alerts' | |
| webhook_configs: | |
| - url: 'http://calert:6000/dispatch' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [app] | |
| address = "0.0.0.0:6000" # Address of the HTTP Server. | |
| server_timeout = "60s" # Server timeout for HTTP requests. | |
| enable_request_logs = true # Whether to log incoming HTTP requests or not. | |
| log = "info" # Use `debug` to enable verbose logging. Can be set to `info` otherwise. | |
| [providers.alerts] | |
| type = "google_chat" # Type of provider. Currently supported value is `google_chat`. | |
| endpoint = "..." # Google Chat Webhook URL | |
| max_idle_conns = 50 # Max idle connections in the HTTP Client. | |
| timeout = "30s" # Timeout for making requests to Provider. | |
| # proxy_url = "http://internal-squid-proxy.com:3128" # Specify `proxy_url` as your proxy endpoint to route all HTTP requests to the provider via a proxy. | |
| template = "static/message.tmpl" # Path to the message template file. | |
| thread_ttl = "2h" # Timeout to keep active alerts in memory. Once this TTL expires, a new thread will be created. | |
| #threaded_replies = true # Whether to send threaded replies or not. | |
| dry_run = false |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| *{{.Labels.severity | toUpper }}*\n | |
| {{ .Labels.alertname }} - {{.Status | toUpper }}* | |
| {{ range .Annotations -}} | |
| {{ .Value}} | |
| {{ end -}} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| groups: | |
| - name: host_alerts | |
| rules: | |
| - alert: "High Host CPU Usage" | |
| expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 40 | |
| for: 2m | |
| labels: | |
| severity: critical | |
| annotations: | |
| # summary: "High CPU usage on host {{ $labels.instance }}" | |
| description: "*Host* - {{ $labels.instance }}\nHost CPU usage has exceeded 40% for the last 2 minutes." | |
| - alert: "Low Host Memory" | |
| expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.2 | |
| for: 2m | |
| labels: | |
| severity: critical | |
| annotations: | |
| # summary: "Low memory on host {{ $labels.instance }}" | |
| description: "*Host* - {{ $labels.instance }}\nAvailable memory is below 20% on the host." | |
| - alert: "High Host Disk Usage" | |
| expr: (node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_avail_bytes{fstype!="tmpfs"}) / node_filesystem_size_bytes{fstype!="tmpfs"} > 0.8 | |
| for: 2m | |
| labels: | |
| severity: critical | |
| annotations: | |
| # summary: "High disk usage on host {{ $labels.instance }}" | |
| description: "*Host* - {{ $labels.instance }}\nDisk usage on {{ $labels.mountpoint }} has exceeded 80%." |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| groups: | |
| - name: container_recording_rules | |
| rules: | |
| # Precompute container CPU usage | |
| - record: container:cpu_usage:rate | |
| expr: rate(container_cpu_usage_seconds_total{name!=""}[5m]) | |
| # Precompute container memory usage | |
| - record: container:memory_working_set | |
| expr: container_memory_working_set_bytes{image!=""} | |
| # Precompute container last seen time difference | |
| - record: container:last_seen:time_diff | |
| expr: time() - max(container_last_seen) without (id) | |
| # Precompute container restart rate | |
| - record: container:restart_rate | |
| expr: rate(container_restart_count[5m]) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| services: | |
| prometheus: | |
| user: 0:0 | |
| container_name: prometheus | |
| image: prom/prometheus:v3.0.1 | |
| restart: always | |
| command: | |
| - '--config.file=/etc/prometheus/prometheus.yml' | |
| - '--storage.tsdb.path=/prometheus' | |
| - '--storage.tsdb.retention.time=6h' | |
| - '--web.console.libraries=/etc/prometheus/console_libraries' | |
| - '--web.console.templates=/etc/prometheus/consoles' | |
| - '--web.enable-lifecycle' | |
| volumes: | |
| - ./temp-prometheus:/prometheus | |
| - ./prometheus.yml:/etc/prometheus/prometheus.yml | |
| - ./host_recording_rules.yml:/etc/prometheus/host_recording_rules.yml | |
| - ./container_recording_rules.yml:/etc/prometheus/container_recording_rules.yml | |
| - ./host_alert_rules.yml:/etc/prometheus/host_alert_rules.yml | |
| - ./container_alert_rules.yml:/etc/prometheus/container_alert_rules.yml | |
| depends_on: | |
| - node-exporter | |
| - cadvisor | |
| ports: | |
| - "9090:9090" | |
| networks: | |
| - monitoring | |
| node-exporter: | |
| image: prom/node-exporter:v1.8.2 | |
| container_name: node-exporter | |
| restart: unless-stopped | |
| volumes: | |
| - /proc:/host/proc:ro | |
| - /sys:/host/sys:ro | |
| - /:/rootfs:ro | |
| command: | |
| - '--path.procfs=/host/proc' | |
| - '--path.rootfs=/rootfs' | |
| - '--path.sysfs=/host/sys' | |
| - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' | |
| expose: | |
| - 9100 | |
| networks: | |
| - monitoring | |
| cadvisor: | |
| container_name: cadvisor | |
| image: gcr.io/cadvisor/cadvisor:v0.49.1 | |
| expose: | |
| - 8080 | |
| volumes: | |
| - "/:/rootfs" | |
| - "/var/run:/var/run" | |
| - "/sys:/sys" | |
| - "/var/lib/docker/:/var/lib/docker" | |
| - "/dev/disk/:/dev/disk" | |
| privileged: true | |
| command: | |
| - "--docker_only=true" | |
| - "--allow_dynamic_housekeeping=false" | |
| - "--global_housekeeping_interval=5s" | |
| - "--housekeeping_interval=5s" | |
| devices: | |
| - "/dev/kmsg" | |
| networks: | |
| - monitoring | |
| networks: | |
| monitoring: | |
| external: true |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| groups: | |
| - name: host_alerts | |
| rules: | |
| - alert: "High Host CPU Usage" | |
| expr: host:cpu_usage:avg > 40 | |
| for: 2m | |
| labels: | |
| severity: critical | |
| annotations: | |
| description: "*Host* - {{ $labels.instance }}\nHost CPU usage has exceeded 40% for the last 2 minutes." | |
| - alert: "Low Host Memory" | |
| expr: host:memory_available:ratio < 0.2 | |
| for: 2m | |
| labels: | |
| severity: critical | |
| annotations: | |
| description: "*Host* - {{ $labels.instance }}\nAvailable memory is below 20% on the host." | |
| - alert: "High Host Disk Usage" | |
| expr: host:disk_usage:ratio > 0.8 | |
| for: 2m | |
| labels: | |
| severity: critical | |
| annotations: | |
| description: "*Host* - {{ $labels.instance }}\nDisk usage on {{ $labels.mountpoint }} has exceeded 80%." |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| groups: | |
| - name: host_recording_rules | |
| rules: | |
| # Precompute average CPU usage per host | |
| - record: host:cpu_usage:avg | |
| expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 | |
| # Precompute memory availability as a fraction | |
| - record: host:memory_available:ratio | |
| expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes | |
| # Precompute disk usage as a fraction | |
| - record: host:disk_usage:ratio | |
| expr: (node_filesystem_size_bytes{fstype!="tmpfs"} - node_filesystem_avail_bytes{fstype!="tmpfs"}) / node_filesystem_size_bytes{fstype!="tmpfs"} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| multitenancy_enabled: false | |
| api: | |
| prometheus_http_prefix: '' | |
| blocks_storage: | |
| backend: s3 | |
| s3: | |
| endpoint: ... | |
| bucket_name: ... | |
| access_key_id: ... | |
| secret_access_key: ... | |
| insecure: true | |
| limits: | |
| # Enable TSDB block upload | |
| max_label_names_per_series: 100 | |
| compactor_block_upload_enabled: true | |
| compactor: | |
| data_dir: /tmp/mimir/compactor | |
| sharding_ring: | |
| kvstore: | |
| store: memberlist | |
| distributor: | |
| ring: | |
| instance_addr: 127.0.0.1 | |
| kvstore: | |
| store: memberlist | |
| ingester: | |
| ring: | |
| instance_addr: 127.0.0.1 | |
| kvstore: | |
| store: memberlist | |
| replication_factor: 1 | |
| server: | |
| http_listen_port: 9009 | |
| store_gateway: | |
| sharding_ring: | |
| replication_factor: 1 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| networks: | |
| monitoring: | |
| external: true | |
| services: | |
| mimir: | |
| image: grafana/mimir:2.13.1 | |
| container_name: mimir | |
| restart: unless-stopped | |
| ports: | |
| - "9009:9009" # API and query endpoint | |
| volumes: | |
| - ./mimir-config.yaml:/etc/mimir/mimir-config.yaml | |
| command: | |
| - "-config.file=/etc/mimir/mimir-config.yaml" | |
| networks: | |
| - monitoring |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| services: | |
| minio: | |
| image: quay.io/minio/minio:RELEASE.2024-11-07T00-52-20Z | |
| container_name: minio | |
| environment: | |
| MINIO_ROOT_USER: ... | |
| MINIO_ROOT_PASSWORD: ... | |
| MINIO_BROWSER_REDIRECT_URL: https://... | |
| command: server /data --console-address ":9001" | |
| ports: | |
| - "9000:9000" # MinIO API | |
| - "9001:9001" # MinIO Console | |
| volumes: | |
| - ./minio/data:/data | |
| networks: | |
| - monitoring | |
| networks: | |
| monitoring: | |
| external: true |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| global: | |
| scrape_interval: 10s | |
| evaluation_interval: 10s | |
| scrape_configs: | |
| - job_name: 'node' | |
| scrape_interval: 5s | |
| static_configs: | |
| - targets: ['node-exporter:9100'] | |
| labels: | |
| instance: "One O One - Kalyan Mudumby" | |
| - job_name: cadvisor | |
| scrape_interval: 5s | |
| static_configs: | |
| - targets: ['cadvisor:8080'] | |
| rule_files: | |
| - "host_recording_rules.yml" | |
| - "container_recording_rules.yml" | |
| - "host_alert_rules.yml" | |
| - "container_alert_rules.yml" | |
| alerting: | |
| alertmanagers: | |
| - scheme: https | |
| static_configs: | |
| - targets: | |
| - ... | |
| remote_write: | |
| - url: "https://.../api/v1/push" | |
| headers: | |
| X-Scope-OrgID: "anonymous" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment

