Hi all!
I'm trying to implement beyla.ebpf.
On the metrics side everything works perfectly, but when I try to get traces working I only get trace IDs with a single span, not the end-to-end flow of the request that I expect to see.
I tried to "overkill" the permissions but nothing seems to help, and I don't see any errors in the pod logs.
When I run it in debug mode I can see that Alloy recognizes an incoming traceparent, and creates one if none is present:
2025-05-20 18:55:50.52065550 (3.719634ms[3.715762ms]) HTTP 200 POST /push.v1.PusherService/Push [ as :994]->[ as .monitoring:0] size:21327B svc=[monitoring/pyroscope go] traceparent=[00-4bceb9958abac43a478baa300ff0713d-2e896b9fe03aff27[0000000000000000]-01]
2025-05-20 18:55:50.52065550 (863.98µs[863.98µs]) HTTPClient 200 POST /ingester.v1.IngesterService/Push [ as .monitoring:0]->[ as :0] size:3316B svc=[monitoring/pyroscope go] traceparent=[00-884385e26df50018701692aacdda8389-2a62c442a8f21cd7[159166e53bea41ca]-01]
2025-05-20 18:55:50.52065550 (1.64119ms[1.63711ms]) HTTP 200 POST /push.v1.PusherService/Push [ as :994]->[ as .monitoring:0] size:3669B svc=[monitoring/pyroscope go] traceparent=[00-884385e26df50018701692aacdda8389-159166e53bea41ca[0000000000000000]-01]
2025-05-20 18:55:50.52065550 (1.567125ms[1.567125ms]) HTTPClient 200 POST /ingester.v1.IngesterService/Push [ as .monitoring:0]->[ as :0] size:1125B svc=[monitoring/pyroscope go] traceparent=[00-58a06111affd7e244aacdcd3ea232657-863a816afda45b44[6790f717ee17ba09]-01]
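If I read these traceparents as 00-&lt;trace ID&gt;-&lt;span ID&gt;[&lt;parent span ID&gt;]-01 (the bracketed value looks like the parent span ID Beyla resolved), then for example the second and third lines share trace ID 884385e26df50018701692aacdda8389, and the HTTPClient span 2a62c442a8f21cd7 lists 159166e53bea41ca (the Pusher server span) as its parent:

00  -  884385e26df50018701692aacdda8389  -  2a62c442a8f21cd7 [159166e53bea41ca]  -  01
ver    trace ID                             span ID           parent span ID        flags

So the parent/child links seem to be there in Beyla's debug output, yet in Grafana each trace still shows up as a single span.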
That's how I see the traces in Grafana:
This is my values file for the Alloy Helm chart:
# -- Overrides the chart's name. Used to change the infix in the resource names.
nameOverride: null
# -- Overrides the chart's computed fullname. Used to change the full prefix of
# resource names.
fullnameOverride: null
## Global properties for image pulling that override the values defined under `image.registry` and `configReloader.image.registry`.
## If you want to override only one image registry, use the specific fields; if you want to override them all, use `global.image.registry`.
global:
image:
# -- Global image registry to use if it needs to be overridden for some specific use cases (e.g. local registries, custom images, ...)
registry: ""
# -- Optional set of global image pull secrets.
pullSecrets: []
# -- Security context to apply to the Grafana Alloy pod.
podSecurityContext: {}
crds:
# -- Whether to install CRDs for monitoring.
create: false
## Various Alloy settings. For backwards compatibility with the grafana-agent
## chart, this field may also be called "agent". Naming this field "agent" is
## deprecated and will be removed in a future release.
alloy:
configMap:
# -- Create a new ConfigMap for the config file.
create: true
# -- Content to assign to the new ConfigMap. This is passed into `tpl` allowing for templating from values.
content: ''
# -- Name of existing ConfigMap to use. Used when create is false.
name: null
# -- Key in ConfigMap to get config from.
key: null
clustering:
# -- Deploy Alloy in a cluster to allow for load distribution.
enabled: false
# -- Minimum stability level of components and behavior to enable. Must be
# one of "experimental", "public-preview", or "generally-available".
stabilityLevel: "public-preview"
# -- Path to where Grafana Alloy stores data (for example, the Write-Ahead Log).
# By default, data is lost between reboots.
storagePath: /tmp/alloy
# -- Address to listen for traffic on. 0.0.0.0 exposes the UI to other
# containers.
listenAddr: 0.0.0.0
# -- Port to listen for traffic on.
listenPort: 12345
# -- Scheme is needed for readiness probes. If enabling tls in your configs, set to "HTTPS"
listenScheme: HTTP
# -- Base path where the UI is exposed.
uiPathPrefix: /
# -- Enables sending Grafana Labs anonymous usage stats to help improve Grafana
# Alloy.
enableReporting: false
# -- Extra environment variables to pass to the Alloy container.
extraEnv:
- name: ALLOY_NODE
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: ALLOY_POD
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: ALLOY_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: ALLOY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: BEYLA_BPF_CONTEXT_PROPAGATION
value: "all"
# -- Maps all the keys on a ConfigMap or Secret as environment variables. https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#envfromsource-v1-core
envFrom: []
# -- Extra args to pass to `alloy run`: https://grafana.com/docs/alloy/latest/reference/cli/run/
extraArgs: []
# -- Extra ports to expose on the Alloy container.
extraPorts: []
# - name: "faro"
# port: 12347
# targetPort: 12347
# protocol: "TCP"
mounts:
# -- Mount /var/log from the host into the container for log collection.
varlog: false
# -- Mount /var/lib/docker/containers from the host into the container for log
# collection.
dockercontainers: false
# -- Extra volume mounts to add into the Grafana Alloy container. Does not
# affect the watch container.
extra:
- name: cgroup
mountPath: /sys/fs/cgroup
readOnly: true
- name: security
mountPath: /sys/kernel/security
# -- Security context to apply to the Grafana Alloy container.
securityContext:
capabilities:
add:
- SYS_ADMIN
- SYS_PTRACE
- NET_RAW
- CHECKPOINT_RESTORE
- DAC_READ_SEARCH
- BPF
- PERFMON
- NET_ADMIN
- CAP_BPF
- CAP_NET_RAW
- CAP_NET_ADMIN
- CAP_PERFMON
- CAP_DAC_READ_SEARCH
- CAP_CHECKPOINT_RESTORE
- CAP_SYS_PTRACE
- CAP_SYS_RESOURCE
- CAP_SYS_ADMIN
privileged: true
runAsGroup: 0
runAsUser: 0
appArmorProfile:
type: Unconfined
# -- Resource requests and limits to apply to the Grafana Alloy container.
resources: {}
image:
# -- Grafana Alloy image registry (defaults to docker.io)
registry: "docker-live.artifactory.dev.lala.com/dockerhub"
# -- Grafana Alloy image repository.
repository: grafana/alloy
# -- (string) Grafana Alloy image tag. When empty, the Chart's appVersion is
# used.
tag: v1.8.0
# -- Grafana Alloy image's SHA256 digest (either in format "sha256:XYZ" or "XYZ"). When set, will override `image.tag`.
digest: null
# -- Grafana Alloy image pull policy.
pullPolicy: IfNotPresent
# -- Optional set of image pull secrets.
pullSecrets:
- name: docker-live
rbac:
# -- Whether to create RBAC resources for Alloy.
create: true
serviceAccount:
# -- Whether to create a service account for the Grafana Alloy deployment.
create: true
# -- Additional labels to add to the created service account.
additionalLabels: {}
# -- Annotations to add to the created service account.
annotations: {}
# -- The name of the existing service account to use when
# serviceAccount.create is false.
name: null
# Options for the extra controller used for config reloading.
configReloader:
# -- Enables automatically reloading when the Alloy config changes.
enabled: true
image:
# -- Config reloader image registry (defaults to docker.io)
registry: "docker-live.artifactory.dev.lala.com/ghcr.io"
# -- Repository to get config reloader image from.
repository: jimmidyson/configmap-reload
# -- Tag of image to use for config reloading.
tag: v0.12.0
# -- SHA256 digest of image to use for config reloading (either in format "sha256:XYZ" or "XYZ"). When set, will override `configReloader.image.tag`
digest: ""
# -- Override the args passed to the container.
customArgs: []
# -- Resource requests and limits to apply to the config reloader container.
resources:
requests:
cpu: "1m"
memory: "5Mi"
# -- Security context to apply to the Grafana configReloader container.
securityContext: {}
controller:
# -- Type of controller to use for deploying Grafana Alloy in the cluster.
# Must be one of 'daemonset', 'deployment', or 'statefulset'.
type: 'daemonset'
# -- Number of pods to deploy. Ignored when controller.type is 'daemonset'.
replicas: 1
# -- Annotations to add to controller.
extraAnnotations: {}
# -- Whether to deploy pods in parallel. Only used when controller.type is
# 'statefulset'.
parallelRollout: true
# -- Configures Pods to use the host network. When set to true, the ports that will be used must be specified.
hostNetwork: true
# -- Configures Pods to use the host PID namespace.
hostPID: true
# -- Configures the DNS policy for the pod. https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/#pod-s-dns-policy
dnsPolicy: ClusterFirstWithHostNet
# -- Update strategy for updating deployed Pods.
updateStrategy:
type: RollingUpdate
# -- nodeSelector to apply to Grafana Alloy pods.
nodeSelector:
kubernetes.io/os: linux
# -- Tolerations to apply to Grafana Alloy pods.
tolerations:
- operator: Exists
# -- Topology Spread Constraints to apply to Grafana Alloy pods.
topologySpreadConstraints: []
# -- priorityClassName to apply to Grafana Alloy pods.
priorityClassName: ''
# -- Extra pod annotations to add.
podAnnotations: {}
# -- Extra pod labels to add.
podLabels:
Product: DevOps
ProductComponents: Monitoring
service: grafana-alloy-ebpf
# -- Whether to enable automatic deletion of stale PVCs due to a scale down operation, when controller.type is 'statefulset'.
enableStatefulSetAutoDeletePVC: false
autoscaling:
# -- Creates a HorizontalPodAutoscaler for controller type deployment.
enabled: false
# -- The lower limit for the number of replicas to which the autoscaler can scale down.
minReplicas: 1
# -- The upper limit for the number of replicas to which the autoscaler can scale up.
maxReplicas: 5
# -- Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling.
targetCPUUtilizationPercentage: 0
# -- Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling.
targetMemoryUtilizationPercentage: 80
scaleDown:
# -- List of policies to determine the scale-down behavior.
policies: []
# - type: Pods
# value: 4
# periodSeconds: 60
# -- Determines which of the provided scaling-down policies to apply if multiple are specified.
selectPolicy: Max
# -- The duration that the autoscaling mechanism should look back on to make decisions about scaling down.
stabilizationWindowSeconds: 300
scaleUp:
# -- List of policies to determine the scale-up behavior.
policies: []
# - type: Pods
# value: 4
# periodSeconds: 60
# -- Determines which of the provided scaling-up policies to apply if multiple are specified.
selectPolicy: Max
# -- The duration that the autoscaling mechanism should look back on to make decisions about scaling up.
stabilizationWindowSeconds: 0
# -- Affinity configuration for pods.
affinity: {}
volumes:
# -- Extra volumes to add to the Grafana Alloy pod.
extra:
- name: cgroup
hostPath:
path: /sys/fs/cgroup
type: Directory
- name: security
hostPath:
path: /sys/kernel/security
type: Directory
# -- volumeClaimTemplates to add when controller.type is 'statefulset'.
volumeClaimTemplates: []
## -- Additional init containers to run.
## ref: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/
##
initContainers: []
# -- Additional containers to run alongside the Alloy container and initContainers.
extraContainers: []
service:
# -- Creates a Service for the controller's pods.
enabled: true
# -- Service type
type: ClusterIP
# -- NodePort port. Only takes effect when `service.type: NodePort`
nodePort: 31128
# -- Cluster IP, can be set to None, empty "" or an IP address
clusterIP: ''
# -- Value for internal traffic policy. 'Cluster' or 'Local'
internalTrafficPolicy: Cluster
annotations: {}
# cloud.google.com/load-balancer-type: Internal
serviceMonitor:
enabled: true
# -- Additional labels for the service monitor.
additionalLabels: {}
# -- Scrape interval. If not set, the Prometheus default scrape interval is used.
interval: ""
# -- MetricRelabelConfigs to apply to samples after scraping, but before ingestion.
# ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig
metricRelabelings: []
# - action: keep
# regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+'
# sourceLabels: [__name__]
# -- Customize tls parameters for the service monitor
tlsConfig: {}
# -- RelabelConfigs to apply to samples before scraping
# ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#relabelconfig
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# targetLabel: nodename
# replacement: $1
# action: replace
ingress:
# -- Enables ingress for Alloy (Faro port)
enabled: false
# For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName
# See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress
# ingressClassName: nginx
# Values can be templated
annotations:
{}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
labels: {}
path: /
faroPort: 12345
# pathType is only for k8s >= 1.18
pathType: Prefix
hosts:
- chart-example.local
## Extra paths to prepend to every host configuration. This is useful when working with annotation based services.
extraPaths: []
# - path: /*
# backend:
# serviceName: ssl-redirect
# servicePort: use-annotation
## Or for k8s > 1.19
# - path: /*
# pathType: Prefix
# backend:
# service:
# name: ssl-redirect
# port:
# name: use-annotation
tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local
And this is my Alloy config:
discovery.kubernetes "k8s" {
selectors {
field = "spec.nodeName=" + env("HOSTNAME")
role = "pod"
}
namespaces {
names = ["kube-system", "canary-checker", "cert-manager", "external-dns", "external-secrets", "gatekeeper-system", "gpu-resources", "keda", "kubecost", "kubernetes-dashboard", "kyverno", "policy-reporter", "robusta", "spot-handler", "velero", "vault", "opencost", "dask-operator", "istio-system", "knative-eventing", "knative-serving", "kubeflow", "kubescape", "azure-workload-identity-system"]
}
role = "pod"
}
discovery.relabel "k8s" {
targets = discovery.kubernetes.k8s.targets
rule {
action = "drop"
regex = "Succeeded|Failed|Completed"
source_labels = ["__meta_kubernetes_pod_phase"]
}
rule {
action = "replace"
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
rule {
action = "replace"
source_labels = ["__meta_kubernetes_pod_node_name"]
target_label = "node"
}
rule {
action = "replace"
source_labels = ["__meta_kubernetes_pod_label_version"]
target_label = "version"
}
rule {
action = "replace"
regex = "(.*)@(.*)"
replacement = "ebpf.${1}.${2}"
separator = "@"
source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
target_label = "service_name"
}
}
pyroscope.ebpf "k8s" {
forward_to = [pyroscope.write.k8s.receiver]
targets = discovery.relabel.k8s.output
}
pyroscope.write "k8s" {
endpoint {
url = "http://grafana-pyroscope-distributed-distributor.monitoring.svc.cluster.local:4040"
headers = {
"X-Scope-OrgID" = "k8s",
}
}
}
livedebugging {
enabled = true
}
prometheus.exporter.unix "k8s" {}
prometheus.scrape "k8s" {
targets = prometheus.exporter.unix.k8s.targets
forward_to = [prometheus.relabel.k8s.receiver]
}
prometheus.relabel "k8s" {
forward_to = [prometheus.remote_write.k8s.receiver]
rule {
action = "replace"
replacement = "node-exporter"
target_label = "job"
}
rule {
action = "replace"
replacement = sys.env("ALLOY_NODE")
target_label = "node"
}
rule {
action = "replace"
replacement = sys.env("ALLOY_NODE")
target_label = "kubernetes_io_hostname"
}
rule {
action = "replace"
replacement = sys.env("ALLOY_POD")
target_label = "pod"
}
rule {
action = "replace"
replacement = sys.env("ALLOY_NAMESPACE")
target_label = "namespace"
}
rule {
action = "replace"
replacement = "alloy-exporter"
target_label = "container"
}
rule {
action = "replace"
replacement = "alloy-exporter"
target_label = "service"
}
rule {
action = "replace"
replacement = sys.env("ALLOY_POD_IP")
target_label = "instance"
}
}
prometheus.remote_write "k8s" {
endpoint {
url = "http://grafana-mimir-gateway/api/v1/push"
headers = {
"X-Scope-OrgID" = "k8s",
}
}
}
beyla.ebpf "k8s" {
enforce_sys_caps = true
ebpf {
track_request_headers = true
enable_context_propagation = true
}
routes {
unmatched = "heuristic"
}
attributes {
kubernetes {
enable = "true"
}
}
discovery {
services {
kubernetes{
namespace = "time|open|techfoundation|dealcloud|monitoring"
}
exe_path = ".*"
open_ports = "0-65535"
}
}
metrics {
features = [
"network",
"application",
"application_span",
"application_service_graph",
"application_process",
]
instrumentations = ["*"]
}
output {
traces = [
otelcol.connector.servicegraph.k8s.input,
otelcol.processor.filter.k8s.input,
]
}
}
otelcol.processor.filter "k8s" {
error_mode = "ignore"
traces {
span = [
"name == \"get /health|health.*\"",
"name == \"HTTP GET - metrics\"",
"name == \"/thanos.Store/Info\"",
"name == \"/thanos.info.Info/Info\"",
"name == \"/prometheus\"",
"name == \"/api/traces\"",
"name == \"/metrics\"",
"name == \"async\"",
"name == \"/actuator/info\"",
"name == \".*/healthz\"",
"name == \"HealthcheckAction\"",
"name == \"ingress-nginx: GET nginx-internal-.*\"",
"name == \"/grpc.health.v1.Health/Check\"",
"attributes[\"http.user_agent\"] == \"kube-probe/1.30\"",
"attributes[\"user_agent.original\"] == \"kube-probe/1.30\"",
"attributes[\"http.host\"] == \"nginx-internal-.*\"",
]
}
output {
traces = [otelcol.processor.probabilistic_sampler.k8s.input]
}
}
otelcol.processor.probabilistic_sampler "k8s" {
hash_seed = 223
sampling_percentage = 70
output {
traces = [otelcol.processor.resourcedetection.k8s.input]
}
}
otelcol.processor.resourcedetection "k8s" {
detectors = ["aks", "eks"]
system {
hostname_sources = ["os"]
}
output {
traces = [otelcol.processor.k8sattributes.k8s.input]
}
}
otelcol.processor.k8sattributes "k8s" {
extract {
metadata = ["k8s.namespace.name","k8s.pod.name","k8s.deployment.name","k8s.statefulset.name","k8s.daemonset.name","k8s.cronjob.name","k8s.job.name","k8s.node.name","k8s.pod.uid","k8s.pod.start_time"]
}
pod_association {
source {
from = "connection"
}
}
output {
traces = [
otelcol.processor.transform.k8s.input,
otelcol.connector.host_info.k8s.input,
]
}
}
otelcol.connector.host_info "k8s" {
host_identifiers = [ "k8s.node.name" ]
output {
metrics = [otelcol.processor.batch.batch_processor.input]
}
}
otelcol.processor.transform "k8s" {
error_mode = "ignore"
trace_statements {
context = "resource"
statements = []
}
output {
traces = [otelcol.processor.batch.batch_processor.input]
}
}
otelcol.processor.batch "batch_processor" {
send_batch_size = 16384
send_batch_max_size = 0
timeout = "2s"
output {
traces = [otelcol.exporter.otlp.traces_service.input]
}
}
prometheus.scrape "beyla" {
targets = beyla.ebpf.k8s.targets
forward_to = [prometheus.remote_write.k8s.receiver]
}
otelcol.connector.servicegraph "k8s" {
dimensions = ["http.method", "http.target"]
output {
metrics = [otelcol.exporter.prometheus.k8s.input]
}
}
otelcol.exporter.prometheus "k8s" {
forward_to = [prometheus.remote_write.k8s.receiver]
}
otelcol.exporter.otlp "traces_service" {
client {
endpoint = "grafana-tempo-distributed-distributor.monitoring.svc.cluster.local:4317"
tls {
insecure = true
insecure_skip_verify = true
}
}
}
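For completeness, here is a rough sketch of how the batch output could be temporarily teed into a debug exporter to check whether the parent span IDs actually survive the processor chain before reaching Tempo. This assumes otelcol.exporter.debug is available in this Alloy version; the "verify" label and the wiring are only an illustration, not something I have validated:

// Temporary debug exporter: prints the spans it receives (including trace ID and
// parent span ID) to the Alloy pod logs so the pipeline output can be inspected.
otelcol.exporter.debug "verify" {
  verbosity = "detailed"
}

// And in otelcol.processor.batch "batch_processor" the output block would temporarily become:
// output {
//   traces = [
//     otelcol.exporter.otlp.traces_service.input,
//     otelcol.exporter.debug.verify.input,
//   ]
// }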