Alloy keeps crashing with OOM errors no matter how much memory we throw at it. We’re currently scraping 770 hosts with 5 exporters each, and each pod climbs to 2-2.5GB before dying.
With a 4Gi memory limit it stays up, but adding more replicas doesn’t reduce per-pod memory usage.
At 7 replicas the memory pressure even takes down CoreDNS.
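Rough math on why the memory is where it is: 770 hosts x 5 exporters is ~3,850 scrape targets. Even with keep-lists, per-host cardinality adds up: node_cpu_seconds_total alone is cores x ~8 modes (256 series on a 32-core host), and the filesystem/disk/network metrics multiply per mount point and device. If each replica ends up owning a few hundred thousand active series, at a few KB of WAL and remote-write state per series, 2-2.5GB per pod is roughly the ballpark we’d expect.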
Setup:
- Alloy 1.1.1 via Helm
- 2-5 replicas
- 5 node cluster (8c/32GB each)
- Exporters: node, mysqld, process, postfix, elasticsearch
- Mimir backend works fine
Current limits:
resources:
  limits:
    cpu: 1500m
    memory: 4Gi
extraEnv:
  - name: GOMEMLIMIT
    value: "3500MiB"
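One variation we’re considering so the two values can’t drift apart: deriving GOMEMLIMIT from the container limit via the downward API instead of hardcoding it. A sketch (assuming the chart passes extraEnv through as plain Kubernetes EnvVar entries; resourceFieldRef renders the limit as raw bytes, which the Go runtime accepts, though this sets the soft limit to 100% of the container limit rather than our current ~85% headroom):
extraEnv:
  - name: GOMEMLIMIT
    valueFrom:
      resourceFieldRef:
        resource: limits.memory  # rendered as bytes, e.g. 4294967296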
What we’ve tried:
- Enabled clustering (see the note after the Helm values below)
- Strict metric filtering (15 node metrics, 12 MySQL, etc.)
- Longer scrape intervals (30s/60s)
- StatefulSet with 5Gi WAL storage
- HPA scaling 1-7 replicas
We can’t deploy agents on the target hosts for now, so we’re stuck with centralized scraping. Is anyone running Alloy at a similar scale?
Helm values:
# Global configuration
global:
  podSecurityContext:
    runAsUser: 472
    runAsGroup: 472
    fsGroup: 472

# Controller configuration for StatefulSet
controller:
  type: statefulset
  replicas: 2

  # StatefulSet-specific configuration
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1

  # Anti-affinity to spread replicas across nodes
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchLabels:
                app.kubernetes.io/name: grafana-alloy-metrics
            topologyKey: kubernetes.io/hostname

  podDisruptionBudget:
    enabled: true
    minAvailable: 1

  volumes:
    extra:
      - name: scrape-targets
        configMap:
          name: scrape-targets
      - name: alloy-metrics-config
        configMap:
          name: alloy-metrics-config

  # Volume claim templates for WAL storage
  volumeClaimTemplates:
    - metadata:
        name: wal-storage
        labels:
          app.kubernetes.io/name: grafana-alloy-metrics
          app.kubernetes.io/component: wal-storage
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 5Gi

  enableStatefulSetAutoDeletePVC: true

  # Horizontal Pod Autoscaler
  autoscaling:
    horizontal:
      enabled: true
      minReplicas: 1
      maxReplicas: 3
      targetCPUUtilizationPercentage: 70
      targetMemoryUtilizationPercentage: 80

# Service account configuration
serviceAccount:
  create: true

# RBAC for metrics collection
rbac:
  create: true

# Configuration
alloy:
  # Use external ConfigMap created by Terraform
  configMap:
    create: false
    name: "alloy-metrics-config"
    key: config

  storagePath: /data/wal

  mounts:
    varlog: false
    dockercontainers: false
    extra:
      - name: wal-storage
        mountPath: /data/wal
        readOnly: false
      - name: alloy-metrics-config
        mountPath: /etc/alloy/config
        subPath: ""
        readOnly: true
      - name: scrape-targets
        mountPath: /etc/alloy/scrape-targets
        readOnly: true

  # Security context for container
  securityContext:
    capabilities:
      drop:
        - ALL
    readOnlyRootFilesystem: true
    runAsNonRoot: true
    runAsUser: 472
    runAsGroup: 472

  # Resource requests/limits for metrics collection
  resources:
    requests:
      cpu: 500m
      memory: 2Gi
    limits:
      cpu: 1500m
      memory: 4Gi

  # Environment variables
  extraEnv:
    - name: MIMIR_ADDRESS
      value: "${mimir_address}"
    - name: CLUSTER_NAME
      value: "${cluster_name}"
    - name: GOMEMLIMIT
      value: "3500MiB"

# Service configuration
service:
  enabled: true
  type: ClusterIP

# Monitoring and observability
serviceMonitor:
  enabled: true
  interval: 30s
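For completeness, the clustering setting itself doesn’t appear in the values above; it’s set alongside them. A minimal sketch of how we enable it (assuming the grafana/alloy chart, which exposes this under the alloy section):
alloy:
  clustering:
    enabled: true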
Alloy scraping config:
// =============================================================================
// EXPORTER SCRAPING MODULE
// =============================================================================
declare "exporter_scraping" {
argument "metrics_destinations" {
comment = "Must be a list of metric destinations where collected metrics should be forwarded to"
}
argument "clustering_enabled" {
comment = "Whether to enable clustering for load distribution across Alloy instances"
optional = true
default = false
}
// =============================================================================
// DYNAMIC DISCOVERY AND RELABELING PER EXPORTER TYPE
// =============================================================================
// Node Exporter Discovery
discovery.file "nodeexporter_files" {
files = ["/etc/alloy/scrape-targets/nodeexporter-targets.json"]
refresh_interval = "30s"
}
discovery.relabel "nodeexporter_targets" {
targets = discovery.file.nodeexporter_files.targets
// Extract instance name (hostname without port)
rule {
source_labels = ["__address__"]
target_label = "instance"
regex = "([^:]+):.*"
replacement = "${1}"
}
// Set exporter label
rule {
replacement = "nodeexporter-targets"
target_label = "exporter"
}
// Set source label for consistency
rule {
replacement = "exporters"
target_label = "source"
}
}
// MySQL Exporter Discovery
discovery.file "mysqld_files" {
files = ["/etc/alloy/scrape-targets/mysqld-exporter.json"]
refresh_interval = "30s"
}
discovery.relabel "mysqld_targets" {
targets = discovery.file.mysqld_files.targets
// Extract instance name (hostname without port)
rule {
source_labels = ["__address__"]
target_label = "instance"
regex = "([^:]+):.*"
replacement = "${1}"
}
// Set exporter label
rule {
replacement = "mysqld-exporter"
target_label = "exporter"
}
// Set source label for consistency
rule {
replacement = "exporters"
target_label = "source"
}
}
// Process Exporter Discovery
discovery.file "process_files" {
files = ["/etc/alloy/scrape-targets/process-exporter.json"]
refresh_interval = "30s"
}
discovery.relabel "process_targets" {
targets = discovery.file.process_files.targets
// Extract instance name (hostname without port)
rule {
source_labels = ["__address__"]
target_label = "instance"
regex = "([^:]+):.*"
replacement = "${1}"
}
// Set exporter label
rule {
replacement = "process-exporter"
target_label = "exporter"
}
// Set source label for consistency
rule {
replacement = "exporters"
target_label = "source"
}
}
// Elasticsearch Exporter Discovery
discovery.file "elastic_files" {
files = ["/etc/alloy/scrape-targets/elastic-exporter.json"]
refresh_interval = "30s"
}
discovery.relabel "elastic_targets" {
targets = discovery.file.elastic_files.targets
// Extract instance name (hostname without port)
rule {
source_labels = ["__address__"]
target_label = "instance"
regex = "([^:]+):.*"
replacement = "${1}"
}
// Set exporter label
rule {
replacement = "elastic-exporter"
target_label = "exporter"
}
// Set source label for consistency
rule {
replacement = "exporters"
target_label = "source"
}
}
// Postfix Exporter Discovery
discovery.file "postfix_files" {
files = ["/etc/alloy/scrape-targets/postfix-exporter.json"]
refresh_interval = "30s"
}
discovery.relabel "postfix_targets" {
targets = discovery.file.postfix_files.targets
// Extract instance name (hostname without port)
rule {
source_labels = ["__address__"]
target_label = "instance"
regex = "([^:]+):.*"
replacement = "${1}"
}
// Set exporter label
rule {
replacement = "postfix-exporter"
target_label = "exporter"
}
// Set source label for consistency
rule {
replacement = "exporters"
target_label = "source"
}
}
// =============================================================================
// INDIVIDUAL SCRAPE JOBS PER EXPORTER
// =============================================================================
  // Node Exporter Scraping (30s)
prometheus.scrape "nodeexporter_targets" {
targets = discovery.relabel.nodeexporter_targets.output
job_name = "integrations/node-exporter"
scrape_interval = "30s"
scrape_timeout = "20s"
metrics_path = "/metrics"
honor_timestamps = true
honor_labels = true
clustering {
enabled = argument.clustering_enabled.value
}
forward_to = [prometheus.relabel.nodeexporter.receiver]
}
prometheus.relabel "nodeexporter" {
// max_cache_size = 100000
rule {
source_labels = ["__name__"]
regex = string.join([
// Scrape health metrics
"up",
"scrape_samples_scraped",
// CPU metrics
"node_cpu_seconds_total",
// Memory metrics
"node_memory_MemTotal_bytes",
"node_memory_MemAvailable_bytes",
"node_memory_Buffers_bytes",
"node_memory_Cached_bytes",
// Filesystem metrics
"node_filesystem_avail_bytes",
"node_filesystem_size_bytes",
// Disk I/O metrics
"node_disk_read_bytes_total",
"node_disk_written_bytes_total",
// Network metrics
"node_network_receive_bytes_total",
"node_network_transmit_bytes_total",
// Load average metrics
"node_load1",
"node_load5",
"node_load15",
], "|")
action = "keep"
}
// Drop metrics for certain file systems
rule {
source_labels = ["__name__", "fstype"]
separator = "@"
regex = "node_filesystem.*@(ramfs|tmpfs)"
action = "drop"
}
forward_to = argument.metrics_destinations.value
}
// MySQL Exporter Scraping (60s)
prometheus.scrape "mysqld_exporter" {
targets = discovery.relabel.mysqld_targets.output
job_name = "integrations/mysql-exporter"
scrape_interval = "60s"
scrape_timeout = "20s"
metrics_path = "/metrics"
honor_timestamps = true
honor_labels = true
clustering {
enabled = argument.clustering_enabled.value
}
forward_to = [prometheus.relabel.mysqld.receiver]
}
prometheus.relabel "mysqld" {
// max_cache_size = 100000
rule {
source_labels = ["__name__"]
regex = string.join([
// Scrape health metrics
"up",
"scrape_samples_scraped",
// MySQL availability
"mysql_up",
// Connection metrics
"mysql_global_status_connections",
"mysql_global_status_threads_connected",
"mysql_global_status_threads_running",
"mysql_global_variables_max_connections",
// Query performance metrics
"mysql_global_status_slow_queries",
"mysql_global_status_queries",
// InnoDB buffer pool metrics
"mysql_global_status_innodb_buffer_pool_pages_free",
"mysql_global_status_innodb_buffer_pool_pages_total",
"mysql_global_status_innodb_buffer_pool_read_requests",
], "|")
action = "keep"
}
forward_to = argument.metrics_destinations.value
}
// Elasticsearch Exporter Scraping (60s)
prometheus.scrape "elastic_exporter" {
targets = discovery.relabel.elastic_targets.output
job_name = "integrations/elasticsearch-exporter"
scrape_interval = "60s"
scrape_timeout = "20s"
metrics_path = "/metrics"
honor_timestamps = true
honor_labels = true
clustering {
enabled = argument.clustering_enabled.value
}
forward_to = [prometheus.relabel.elasticsearch.receiver]
}
prometheus.relabel "elasticsearch" {
// max_cache_size = 100000
rule {
source_labels = ["__name__"]
regex = string.join([
// Scrape health metrics
"up",
"scrape_samples_scraped",
// Cluster health metrics
"elasticsearch_cluster_health_status",
"elasticsearch_cluster_health_number_of_nodes",
"elasticsearch_cluster_health_active_primary_shards",
"elasticsearch_cluster_health_relocating_shards",
// Index metrics
"elasticsearch_indices_docs",
"elasticsearch_indices_store_size_bytes",
// JVM metrics
"elasticsearch_jvm_memory_used_bytes",
"elasticsearch_jvm_gc_collection_seconds_total",
], "|")
action = "keep"
}
forward_to = argument.metrics_destinations.value
}
// Postfix Exporter Scraping (60s)
prometheus.scrape "postfix_exporter" {
targets = discovery.relabel.postfix_targets.output
job_name = "integrations/postfix-exporter"
scrape_interval = "60s"
scrape_timeout = "20s"
metrics_path = "/metrics"
honor_timestamps = true
honor_labels = true
clustering {
enabled = argument.clustering_enabled.value
}
forward_to = [prometheus.relabel.postfix.receiver]
}
prometheus.relabel "postfix" {
// max_cache_size = 100000
rule {
source_labels = ["__name__"]
regex = string.join([
// Scrape health metrics
"up",
"scrape_samples_scraped",
// Postfix availability
"postfix_up",
// Queue metrics
"postfix_queue_length",
"postfix_showq_message_size_bytes",
// Mail processing metrics
"postfix_bounce_non_delivery_notification_total",
"postfix_cleanup_messages_processed_total",
], "|")
action = "keep"
}
forward_to = argument.metrics_destinations.value
}
}
// =============================================================================
// INSTANTIATE THE EXPORTER SCRAPING MODULE
// =============================================================================
exporter_scraping "feature" {
  metrics_destinations = [
    prometheus.remote_write.mimir.receiver,
  ]
  clustering_enabled = true
}
Mimir config:
prometheus.remote_write "mimir" {
  endpoint {
    url = string.format("%s/api/v1/push", sys.env("MIMIR_ADDRESS"))

    basic_auth {
      username = convert.nonsensitive(remote.kubernetes.secret.alloy_receiver.data["mimir_username"])
      password = remote.kubernetes.secret.alloy_receiver.data["mimir_password"]
    }

    send_native_histograms = false

    write_relabel_config {
      source_labels = ["cluster"]
      regex         = ""
      replacement   = sys.env("CLUSTER_NAME")
      target_label  = "cluster"
    }

    write_relabel_config {
      source_labels = ["k8s_cluster_name"]
      regex         = ""
      replacement   = sys.env("CLUSTER_NAME")
      target_label  = "k8s_cluster_name"
    }
  }
}
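One knob we haven’t touched yet is the remote_write queue, which buffers samples in memory per shard. A sketch of capping it (values are illustrative, not tested at our scale; if we’re reading the docs right, the defaults are capacity = 10000 and max_shards = 50):
prometheus.remote_write "mimir" {
  endpoint {
    url = string.format("%s/api/v1/push", sys.env("MIMIR_ADDRESS"))

    // Smaller queue bounds worst-case buffering: max_shards x capacity samples.
    queue_config {
      capacity             = 2500
      max_shards           = 10
      max_samples_per_send = 1000
    }
  }
}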
Thanks!