Alloy Getting OOMKilled at Scale — 770 Hosts — Need Help

Alloy keeps crashing with OOM errors no matter how much memory we throw at it. Currently scraping 770 hosts with 5 exporters each and it maxes out at 2-2.5GB before dying.
If we allow up to 4 GB of RAM it runs, but adding more replicas won't reduce the load.
At 7 replicas it even kills CoreDNS.

Setup:

  • Alloy 1.1.1 via Helm
  • 2-5 replicas
  • 5 node cluster (8c/32GB each)
  • Exporters: node, mysqld, process, postfix, elasticsearch
  • Mimir backend works fine

Current limits:

resources:
  limits:
    cpu: 1500m
    memory: 4Gi
extraEnv:
  - name: GOMEMLIMIT
    value: 3500MiB

What we’ve done:

  • Enabled clustering
  • Strict metric filtering (15 node metrics, 12 mysql, etc.)
  • Longer scrape intervals (30s/60s)
  • StatefulSet with 5Gi WAL storage
  • HPA scaling 1-7 replicas

Can’t deploy agents on the target hosts for now, so we’re stuck with centralized scraping. Anyone running Alloy at similar scale?

Helm values:

# Global configuration
global:
  podSecurityContext:
    runAsUser: 472
    runAsGroup: 472
    fsGroup: 472

# Controller configuration for StatefulSet
controller:
  type: statefulset
  replicas: 2
  
  # StatefulSet-specific configuration
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1

  # Anti-affinity to spread replicas across nodes
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchLabels:
                app.kubernetes.io/name: grafana-alloy-metrics
            topologyKey: kubernetes.io/hostname

  podDisruptionBudget:
    enabled: true
    minAvailable: 1

  volumes:
    extra:
      - name: scrape-targets
        configMap:
          name: scrape-targets
      - name: alloy-metrics-config
        configMap:
          name: alloy-metrics-config

  # Volume claim templates for WAL storage
  volumeClaimTemplates:
    - metadata:
        name: wal-storage
        labels:
          app.kubernetes.io/name: grafana-alloy-metrics
          app.kubernetes.io/component: wal-storage
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 5Gi
  
  # Horizontal Pod Autoscaler
  enableStatefulSetAutoDeletePVC: true
  autoscaling:
    horizontal:
      enabled: true
      minReplicas: 1
      maxReplicas: 7
      targetCPUUtilizationPercentage: 70
      targetMemoryUtilizationPercentage: 80

# Service account configuration
serviceAccount:
  create: true

# RBAC for metrics collection
rbac:
  create: true

# Configuration
alloy:
  # Use external ConfigMap created by Terraform
  configMap:
    create: false
    name: "alloy-metrics-config"
    key: config
  storagePath: /data/wal
  mounts:
    varlog: false
    dockercontainers: false
    extra:
      - name: wal-storage
        mountPath: /data/wal
        readOnly: false
      - name: alloy-metrics-config
        mountPath: /etc/alloy/config
        subPath: ""
        readOnly: true
      - name: scrape-targets
        mountPath: /etc/alloy/scrape-targets
        readOnly: true
  
  # Security context for container
  securityContext:
    capabilities:
      drop:
        - ALL
    readOnlyRootFilesystem: true
    runAsNonRoot: true
    runAsUser: 472
    runAsGroup: 472
  
  # Resource requests/limits for metrics collection
  resources:
    requests:
      cpu: 500m
      memory: 2Gi
    limits:
      cpu: 1500m
      memory: 4Gi
  
  # Environment variables
  extraEnv:
    - name: MIMIR_ADDRESS
      value: "${mimir_address}"
    - name: CLUSTER_NAME
      value: "${cluster_name}"
    - name: GOMEMLIMIT
      value: 3500MiB

# Service configuration
service:
  enabled: true
  type: ClusterIP

# Monitoring and observability
serviceMonitor:
  enabled: true
  interval: 30s

Alloy scraping config:

// =============================================================================
// EXPORTER SCRAPING MODULE
// =============================================================================

declare "exporter_scraping" {
  argument "metrics_destinations" {
    comment = "Must be a list of metric destinations where collected metrics should be forwarded to"
  }

  argument "clustering_enabled" {
    comment = "Whether to enable clustering for load distribution across Alloy instances"
    optional = true
    default = false
  }

  // =============================================================================
  // DYNAMIC DISCOVERY AND RELABELING PER EXPORTER TYPE
  // =============================================================================

  // Node Exporter Discovery
  discovery.file "nodeexporter_files" {
    files = ["/etc/alloy/scrape-targets/nodeexporter-targets.json"]
    refresh_interval = "30s"
  }

  discovery.relabel "nodeexporter_targets" {
    targets = discovery.file.nodeexporter_files.targets
    
    // Extract instance name (hostname without port)
    rule {
      source_labels = ["__address__"]
      target_label  = "instance"
      regex         = "([^:]+):.*"
      replacement   = "${1}"
    }
    
    // Set exporter label
    rule {
      replacement = "nodeexporter-targets"
      target_label = "exporter"
    }

    // Set source label for consistency
    rule {
      replacement = "exporters"
      target_label = "source"
    }
  }

  // MySQL Exporter Discovery
  discovery.file "mysqld_files" {
    files = ["/etc/alloy/scrape-targets/mysqld-exporter.json"]
    refresh_interval = "30s"
  }

  discovery.relabel "mysqld_targets" {
    targets = discovery.file.mysqld_files.targets
    
    // Extract instance name (hostname without port)
    rule {
      source_labels = ["__address__"]
      target_label  = "instance"
      regex         = "([^:]+):.*"
      replacement   = "${1}"
    }
    
    // Set exporter label
    rule {
      replacement = "mysqld-exporter"
      target_label = "exporter"
    }

    // Set source label for consistency
    rule {
      replacement = "exporters"
      target_label = "source"
    }
  }

  // Process Exporter Discovery
  discovery.file "process_files" {
    files = ["/etc/alloy/scrape-targets/process-exporter.json"]
    refresh_interval = "30s"
  }

  discovery.relabel "process_targets" {
    targets = discovery.file.process_files.targets
    
    // Extract instance name (hostname without port)
    rule {
      source_labels = ["__address__"]
      target_label  = "instance"
      regex         = "([^:]+):.*"
      replacement   = "${1}"
    }
    
    // Set exporter label
    rule {
      replacement = "process-exporter"
      target_label = "exporter"
    }

    // Set source label for consistency
    rule {
      replacement = "exporters"
      target_label = "source"
    }
  }

  // Elasticsearch Exporter Discovery
  discovery.file "elastic_files" {
    files = ["/etc/alloy/scrape-targets/elastic-exporter.json"]
    refresh_interval = "30s"
  }

  discovery.relabel "elastic_targets" {
    targets = discovery.file.elastic_files.targets
    
    // Extract instance name (hostname without port)
    rule {
      source_labels = ["__address__"]
      target_label  = "instance"
      regex         = "([^:]+):.*"
      replacement   = "${1}"
    }
    
    // Set exporter label
    rule {
      replacement = "elastic-exporter"
      target_label = "exporter"
    }

    // Set source label for consistency
    rule {
      replacement = "exporters"
      target_label = "source"
    }
  }

  // Postfix Exporter Discovery
  discovery.file "postfix_files" {
    files = ["/etc/alloy/scrape-targets/postfix-exporter.json"]
    refresh_interval = "30s"
  }

  discovery.relabel "postfix_targets" {
    targets = discovery.file.postfix_files.targets
    
    // Extract instance name (hostname without port)
    rule {
      source_labels = ["__address__"]
      target_label  = "instance"
      regex         = "([^:]+):.*"
      replacement   = "${1}"
    }
    
    // Set exporter label
    rule {
      replacement = "postfix-exporter"
      target_label = "exporter"
    }

    // Set source label for consistency
    rule {
      replacement = "exporters"
      target_label = "source"
    }
  }

  // =============================================================================
  // INDIVIDUAL SCRAPE JOBS PER EXPORTER
  // =============================================================================

  // Node Exporter Scraping (30s)
  prometheus.scrape "nodeexporter_targets" {
    targets = discovery.relabel.nodeexporter_targets.output
    
    job_name         = "integrations/node-exporter"
    scrape_interval  = "30s"
    scrape_timeout   = "20s"
    metrics_path     = "/metrics"
    honor_timestamps = true
    honor_labels     = true
    
    clustering {
      enabled = argument.clustering_enabled.value
    }
    
    forward_to = [prometheus.relabel.nodeexporter.receiver]
  }

  prometheus.relabel "nodeexporter" {
    // max_cache_size = 100000
    rule {
      source_labels = ["__name__"]
      regex = string.join([
        // Scrape health metrics
        "up",
        "scrape_samples_scraped",
        
        // CPU metrics
        "node_cpu_seconds_total",
        
        // Memory metrics
        "node_memory_MemTotal_bytes",
        "node_memory_MemAvailable_bytes", 
        "node_memory_Buffers_bytes",
        "node_memory_Cached_bytes",
        
        // Filesystem metrics
        "node_filesystem_avail_bytes",
        "node_filesystem_size_bytes",
        
        // Disk I/O metrics
        "node_disk_read_bytes_total",
        "node_disk_written_bytes_total",
        
        // Network metrics
        "node_network_receive_bytes_total",
        "node_network_transmit_bytes_total",
        
        // Load average metrics
        "node_load1",
        "node_load5", 
        "node_load15",
      ], "|")
      action = "keep"
    }
    // Drop metrics for certain file systems
    rule {
      source_labels = ["__name__", "fstype"]
      separator = "@"
      regex = "node_filesystem.*@(ramfs|tmpfs)"
      action = "drop"
    }
    forward_to = argument.metrics_destinations.value
  }

  // MySQL Exporter Scraping (60s)
  prometheus.scrape "mysqld_exporter" {
    targets = discovery.relabel.mysqld_targets.output
    
    job_name         = "integrations/mysql-exporter"
    scrape_interval  = "60s"
    scrape_timeout   = "20s"
    metrics_path     = "/metrics"
    honor_timestamps = true
    honor_labels     = true
    
    clustering {
      enabled = argument.clustering_enabled.value
    }
    
    forward_to = [prometheus.relabel.mysqld.receiver]
  }

  prometheus.relabel "mysqld" {
    // max_cache_size = 100000
    rule {
      source_labels = ["__name__"]
      regex = string.join([
        // Scrape health metrics
        "up",
        "scrape_samples_scraped",
        
        // MySQL availability
        "mysql_up",
        
        // Connection metrics
        "mysql_global_status_connections",
        "mysql_global_status_threads_connected",
        "mysql_global_status_threads_running",
        "mysql_global_variables_max_connections",
        
        // Query performance metrics
        "mysql_global_status_slow_queries",
        "mysql_global_status_queries",
        
        // InnoDB buffer pool metrics
        "mysql_global_status_innodb_buffer_pool_pages_free",
        "mysql_global_status_innodb_buffer_pool_pages_total",
        "mysql_global_status_innodb_buffer_pool_read_requests",
      ], "|")
      action = "keep"
    }
    forward_to = argument.metrics_destinations.value
  }

  // Elasticsearch Exporter Scraping (60s)
  prometheus.scrape "elastic_exporter" {
    targets = discovery.relabel.elastic_targets.output
    
    job_name         = "integrations/elasticsearch-exporter"
    scrape_interval  = "60s"
    scrape_timeout   = "20s"
    metrics_path     = "/metrics"
    honor_timestamps = true
    honor_labels     = true
    
    clustering {
      enabled = argument.clustering_enabled.value
    }
    
    forward_to = [prometheus.relabel.elasticsearch.receiver]
  }

  prometheus.relabel "elasticsearch" {
    // max_cache_size = 100000
    rule {
      source_labels = ["__name__"]
      regex = string.join([
        // Scrape health metrics
        "up",
        "scrape_samples_scraped",
        
        // Cluster health metrics
        "elasticsearch_cluster_health_status",
        "elasticsearch_cluster_health_number_of_nodes",
        "elasticsearch_cluster_health_active_primary_shards",
        "elasticsearch_cluster_health_relocating_shards",
        
        // Index metrics
        "elasticsearch_indices_docs",
        "elasticsearch_indices_store_size_bytes",
        
        // JVM metrics
        "elasticsearch_jvm_memory_used_bytes",
        "elasticsearch_jvm_gc_collection_seconds_total",
      ], "|")
      action = "keep"
    }
    forward_to = argument.metrics_destinations.value
  }

  // Postfix Exporter Scraping (60s)
  prometheus.scrape "postfix_exporter" {
    targets = discovery.relabel.postfix_targets.output
    
    job_name         = "integrations/postfix-exporter"
    scrape_interval  = "60s"
    scrape_timeout   = "20s"
    metrics_path     = "/metrics"
    honor_timestamps = true
    honor_labels     = true
    
    clustering {
      enabled = argument.clustering_enabled.value
    }
    
    forward_to = [prometheus.relabel.postfix.receiver]
  }

  prometheus.relabel "postfix" {
    // max_cache_size = 100000
    rule {
      source_labels = ["__name__"]
      regex = string.join([
        // Scrape health metrics
        "up",
        "scrape_samples_scraped",
        
        // Postfix availability
        "postfix_up",
        
        // Queue metrics
        "postfix_queue_length",
        "postfix_showq_message_size_bytes",
        
        // Mail processing metrics
        "postfix_bounce_non_delivery_notification_total",
        "postfix_cleanup_messages_processed_total",
      ], "|")
      action = "keep"
    }
    forward_to = argument.metrics_destinations.value
  }
}

// =============================================================================
// INSTANTIATE THE EXPORTER SCRAPING MODULE
// =============================================================================

exporter_scraping "feature" {
  metrics_destinations = [
    prometheus.remote_write.mimir.receiver,
  ]
  clustering_enabled = true
}

Mimir config:

prometheus.remote_write "mimir" {
  endpoint {
    url = string.format("%s/api/v1/push", sys.env("MIMIR_ADDRESS"))
    basic_auth {
      username = convert.nonsensitive(remote.kubernetes.secret.alloy_receiver.data["mimir_username"])
      password = remote.kubernetes.secret.alloy_receiver.data["mimir_password"]
    }

    send_native_histograms = false

    write_relabel_config {
      source_labels = ["cluster"]
      regex = ""
      replacement = sys.env("CLUSTER_NAME")
      target_label = "cluster"
    }
    write_relabel_config {
      source_labels = ["k8s_cluster_name"]
      regex = ""
      replacement = sys.env("CLUSTER_NAME")
      target_label = "k8s_cluster_name"
    }
  }
}

Thanks!

How big are the JSON files in /etc/alloy/scrape-targets/?

I see a lot of regex processing being applied to them — I wonder if that's what is causing the OOM.

Can you try running the scrape without the regex rules on a non-prod instance, using one sample JSON file, and see how it performs?