What is wrong with Docker metrics and Alloy?

Hi there,

It seems I'm not the first to run into trouble getting Docker metrics out of Alloy.

My setup: one Alloy instance scrapes the host's metrics and logs (Docker containers included) and forwards everything to a second Alloy instance, which receives the data and writes it to Mimir and Loki.

Does anyone have a clue, dear community?
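
To summarize the intended data flow (this is my reading of the config below, not verified behavior):

Scraping instance:
  cadvisor / unix exporters -> prometheus.scrape -> prometheus.relabel
    -> otelcol.receiver.prometheus -> memory_limiter -> batch -> otelcol.exporter.otlp (TLS, :4317)
  file logs / docker logs -> loki.source.file / loki.source.docker -> loki.process
    -> otelcol.receiver.loki -> same memory_limiter -> batch -> otelcol.exporter.otlp

Receiving instance:
  otelcol.receiver.otlp (:4317, :4318) -> memory_limiter -> batch
    -> metrics: otelcol.exporter.prometheus -> prometheus.remote_write -> Mimir
    -> logs:    otelcol.exporter.loki -> loki.write -> Loki
    -> traces:  otelcol.exporter.otlp -> Tempo

Here's the config of the scraping instance: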

logging {
  level = "info"
}

livedebugging {
  enabled = true
}

prometheus.exporter.cadvisor "cadvisor" {
  store_container_labels = true
  docker_host            = "unix:///rootfs/var/run/docker.sock"
  enabled_metrics        = [
    "cpu", "sched", "percpu", "memory", "memory_numa", "cpuLoad", "diskIO", "disk",
    "network", "tcp", "advtcp", "udp", "app", "process", "hugetlb", "perf_event",
    "referenced_memory", "cpu_topology", "resctrl", "cpuset", "oom_event",
  ]
}
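
For reference, this is the minimal pair I'd use to isolate the cadvisor exporter (a sketch, assuming the Docker socket really is reachable at that path inside the container; the prometheus.remote_write block here is hypothetical, just for the test, and bypasses the otelcol pipeline entirely):

prometheus.exporter.cadvisor "test" {
  docker_host = "unix:///rootfs/var/run/docker.sock"
}

prometheus.scrape "test" {
  targets    = prometheus.exporter.cadvisor.test.targets
  forward_to = [prometheus.remote_write.test.receiver]
}

// Hypothetical direct write to Mimir for this test only (TLS options omitted).
prometheus.remote_write "test" {
  endpoint {
    url = "https://" + sys.env("MIMIR_HOST") + ":9009/api/v1/push"
  }
}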

prometheus.scrape "scrape_cadvisor" {
  targets    = prometheus.exporter.cadvisor.cadvisor.targets
  job_name   = "host_cadvisor"
  
  forward_to = [prometheus.relabel.job_relabel.receiver]
}

/******* NODE_EXPORTER METRICS *******/

prometheus.exporter.unix "node_exporter" { 
  rootfs_path = "/rootfs"
  procfs_path = "/rootfs/proc"
  sysfs_path  = "/rootfs/sys"
}


prometheus.scrape "scrape_node_exporter" {
  targets    = prometheus.exporter.unix.node_exporter.targets
  job_name   = "host_metrics"
  honor_labels = true
  forward_to = [prometheus.relabel.job_relabel.receiver]
}

prometheus.relabel "job_relabel" {

  forward_to = [otelcol.receiver.prometheus.default.receiver]
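
  // Without source_labels, each rule below is an unconditional "replace",
  // so every series gets job="sdc_monitoring_host" and group="host".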

  rule {
    target_label = "job"
    replacement = "sdc_monitoring_host"
  }

  rule {
    target_label = "group"
    replacement = "host"
  }
}

/******* LOGS *******/

local.file_match "collect_logs" {
  path_targets = [
    {"__path__" = "/rootfs/var/log/*.log"},
    {"__path__" = "/rootfs/var/log/**/*.log"},
    {"__path__" = "/rootfs/var/lib/docker/containers/**/*.log"},
  ]
}
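
// Note: the docker containers glob above overlaps with loki.source.docker below,
// so container logs may end up ingested twice.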

loki.source.file "scrape_logs" {
  targets    = local.file_match.collect_logs.targets
  forward_to = [loki.process.add_label.receiver]
}

loki.process "add_label" {
  forward_to = [otelcol.receiver.loki.loki_tosend.receiver]

  stage.labels {
    values = {
      group = "host",
    }
  }
}

/******* DOCKER LOGS *******/

discovery.docker "host" {
  host = "unix:///rootfs/var/run/docker.sock"
}

discovery.relabel "logs_integrations_docker" {
  targets = []
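  // targets is deliberately empty: only the exported rules are used,
  // via relabel_rules in loki.source.docker below.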

  rule {
    target_label = "job"
    replacement  = "sdc_monitoring_host"
  }

  rule {
    target_label = "instance"
    replacement  = constants.hostname
  }

  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "container"
  }

  rule {
    source_labels = ["__meta_docker_container_log_stream"]
    target_label  = "stream"
  }
}

loki.source.docker "default" {
  host       = "unix:///rootfs/var/run/docker.sock"
  targets    = discovery.docker.host.targets
  labels     = {"source" = "docker", "group" = "host"}
  relabel_rules = discovery.relabel.logs_integrations_docker.rules
  forward_to = [loki.process.docker.receiver]
  refresh_interval = "5s"
}

loki.process "docker" {
  forward_to = [otelcol.receiver.loki.loki_tosend.receiver]

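  // stage.docker parses Docker's JSON log format ("log", "stream", "time")
  // into the log line, stream label, and timestamp.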
  stage.docker {}

}

/******* OTELCOL RECEIVER *******/
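
// otelcol.receiver.prometheus converts the scraped Prometheus samples into OTLP
// so they can travel through the otelcol processors and the OTLP exporter below.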

otelcol.receiver.prometheus "default" {
	output {
		metrics = [otelcol.processor.memory_limiter.default.input]
	}
}

otelcol.receiver.loki "loki_tosend" {
  output {
    logs = [otelcol.processor.memory_limiter.default.input]
  }
}

/******* PROCESSORS *******/

otelcol.processor.memory_limiter "default" {
  check_interval    = "10s"
  limit             = "10000MiB"

  output {
    metrics = [otelcol.processor.batch.default.input]
    logs    = [otelcol.processor.batch.default.input]
    traces  = [otelcol.processor.batch.default.input]
  }
}

otelcol.processor.batch "default" {
  send_batch_size = 10000

  output {
    metrics   = [otelcol.exporter.otlp.export_to_monitoring.input]
    logs      = [otelcol.exporter.otlp.export_to_monitoring.input]
    traces    = [otelcol.exporter.otlp.export_to_monitoring.input]
  }
}

/******* EXPORT *******/

otelcol.exporter.otlp "export_to_monitoring" {
  client {
    endpoint = sys.env("ALLOY_HOST") + ":4317"
    tls {
      insecure_skip_verify = false
      cert_file = "/etc/certs/monitoring.crt"
      key_file  = "/etc/certs/monitoring.key"
      ca_file   = "/etc/certs/monitoring.crt"
    }
  }
}

That's the scraping side. Here's the config of the receiving instance:

otelcol.receiver.otlp "ingest" {
  grpc {
    endpoint = "0.0.0.0:4317"
    tls {
      cert_file = "/etc/certs/monitoring.crt"
      key_file  = "/etc/certs/monitoring.key"
      ca_file   = "/etc/certs/monitoring.crt"
      insecure_skip_verify = true
    }
  }

  http {
    endpoint = "0.0.0.0:4318"
    tls {
      cert_file = "/etc/certs/monitoring.crt"
      key_file  = "/etc/certs/monitoring.key"
      ca_file   = "/etc/certs/monitoring.crt"
      insecure_skip_verify = true
    }
  }

  output {
    metrics = [otelcol.processor.memory_limiter.default.input]
    logs    = [otelcol.processor.memory_limiter.default.input]
    traces  = [otelcol.processor.memory_limiter.default.input]
  }
}
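
// Bridges Prometheus-native metrics (the self-monitoring scrape below and the
// docker_metrics relabel) back into the otelcol pipeline as OTLP.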

otelcol.receiver.prometheus "prom_ingest" {
    output {
    metrics = [otelcol.processor.memory_limiter.default.input]
    logs    = [otelcol.processor.memory_limiter.default.input]
    traces  = [otelcol.processor.memory_limiter.default.input]
  }
}

// Scraping our own metrics for self-monitoring
// Scrape Tempo, Mimir, Loki, Alloy (and Grafana)
prometheus.scrape "sdc_monitoring_infra" {
    // The targets array specifies which services to scrape: each entry defines
    // an address plus 'group' and 'service' labels.
    scheme = "https"
    targets = [
        {"__address__" = sys.env("MIMIR_HOST") + ":9009", group = "infrastructure", service = "mimir"},
        {"__address__" = sys.env("TEMPO_HOST") + ":3200", group = "infrastructure", service = "tempo"},
        {"__address__" = sys.env("LOKI_HOST") + ":3100",  group = "infrastructure", service = "loki"},
        {"__address__" = "localhost:12345",               group = "infrastructure", service = "alloy"},
        // {"__address__" = sys.env("GRAFANA_HOST")+":"+sys.env("GRAFANA_PORT"), group = "infrastructure", service = "grafana"},
    ]
    // The job name to add to the scraped metrics.
    job_name = "sdc_monitoring_infra"
    tls_config {
      cert_file = "/etc/certs/monitoring.crt"
      key_file  = "/etc/certs/monitoring.key"
      ca_file   = "/etc/certs/monitoring.crt"
      insecure_skip_verify = true
    }
    // Scrape all of these services every 15 seconds.
    scrape_interval = "15s"
    // Convert the scraped metrics to OTLP via the prometheus receiver; they reach Mimir through the exporters below.
    forward_to = [otelcol.receiver.prometheus.prom_ingest.receiver]
}

// Relabeling rules to make sure Docker metrics are correctly identified.
// Note: no component currently forwards to this relabel, so these rules never
// see any samples; metrics arriving over OTLP bypass it entirely.
prometheus.relabel "docker_metrics" {
  // prometheus.relabel can only forward to Prometheus metric receivers, so
  // route through the prometheus receiver rather than an otelcol processor input.
  forward_to = [otelcol.receiver.prometheus.prom_ingest.receiver]

  rule {
    source_labels = ["job"]
    regex         = "sdc_monitoring_host"
    action        = "keep"
  }

  rule {
    target_label = "source"
    replacement  = "docker_host"
  }
}

otelcol.processor.memory_limiter "default" {
  check_interval    = "1s"
  limit             = "400MiB"

  output {
    metrics = [otelcol.processor.batch.default.input]
    logs    = [otelcol.processor.batch.default.input]
    traces  = [otelcol.processor.batch.default.input]
  }
}

otelcol.processor.batch "default" {
  timeout = "10s"
  send_batch_size = 10000

  output {
    metrics = [otelcol.exporter.prometheus.default.input]
    logs    = [otelcol.exporter.loki.default.input]
    traces  = [otelcol.exporter.otlp.default.input]
  }
}

// Exports 
// Loki export
otelcol.exporter.loki "default" {
  forward_to = [loki.write.default.receiver]
}

loki.write "default" {
  endpoint {
    url = "https://"+sys.env("LOKI_HOST")+":3100/loki/api/v1/push"
    tls_config {
      cert_file = "/etc/certs/monitoring.crt"
      key_file  = "/etc/certs/monitoring.key"
      ca_file   = "/etc/certs/monitoring.crt"
      insecure_skip_verify = true
    }
  }
}

// Tempo export
otelcol.exporter.otlp "default" {
    client {
      endpoint = sys.env("TEMPO_HOST") + ":4317"
      tls {
        cert_file = "/etc/certs/monitoring.crt"
        key_file  = "/etc/certs/monitoring.key"  
        ca_file   = "/etc/certs/monitoring.crt"
        insecure_skip_verify = true
      }
    }
}

// Mimir export
otelcol.exporter.prometheus "default" {
  forward_to = [prometheus.remote_write.default.receiver]
}

prometheus.remote_write "default" {
  endpoint {
    url = "https://"+sys.env("MIMIR_HOST")+":9009/api/v1/push"
    tls_config {
      cert_file = "/etc/certs/monitoring.crt"
      key_file  = "/etc/certs/monitoring.key"
      ca_file   = "/etc/certs/monitoring.crt"
      insecure_skip_verify = true
    }
  }
}

And here's the relevant part of my docker-compose file for the host Alloy instance (top of the file omitted):

  alloy_host:
    container_name: ${PROJECT_NAME}-alloy-host
    hostname: ${PROJECT_NAME}-alloy-host
    image: grafana/alloy:latest
    # command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config_host.alloy
    command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config_host.alloy --stability.level=experimental 
    environment:
      ALLOY_HOST: ${ALLOY_HOST}
    volumes:
      - ../host/alloy:/etc/alloy
      # CAdvisor mounts
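      # / is mounted at /rootfs so that prometheus.exporter.unix's rootfs/procfs/sysfs
      # paths and cadvisor's unix:///rootfs/var/run/docker.sock resolve in-container.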
      - /:/rootfs
      - /var/run:/var/run:ro
      - /sys:/rootfs/sys:ro
      - /proc:/rootfs/proc:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /var/run/docker.sock:/rootfs/var/run/docker.sock
      - /dev/disk/:/dev/disk:ro
      # - /var/run/docker/metrics.sock:/host/var/run/docker/metrics.sock:rw
      - ${CERTIFICATE_DIRECTORY}:/etc/certs
    networks:
      - internal
    privileged: true

volumes:
  alloy-data:

networks:
  internal:
    name: ${PROJECT_NAME}-internal-network-${ENV}
    external: true