I'm tuning an existing Loki deployment running in microservices (Distributed) mode.
Our logs live on S3 (about 2.0 TB stored, ingesting roughly 30 GB/day).
When I query 6 months back from my dashboard, I get slow responses or 502s. The dashboard has 11 visualizations (graphs and tables).
Throwing a lot more resources at it buys a little performance, but we're not happy with either the cost or the results. Can anyone share tips for my Helm values.yaml to improve query performance while lowering resource usage at the same time?
A 30-day query currently takes around 2 minutes; our goal is to get that down to 30 seconds.
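For context, here is my mental model of how the frontend fans these queries out (my own back-of-envelope numbers, happy to be corrected):

# With split_queries_by_interval: 12h, a range query is cut into one
# subquery per 12h slice:
#   30 days  -> ~60 subqueries
#   6 months -> ~360 subqueries
# Each slice can shard further on the TSDB index, and
# max_query_parallelism / tsdb_max_query_parallelism cap how many
# subqueries get scheduled at once.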
values.yaml:
loki:
  limits_config:
    max_streams_per_user: 1000000
    max_line_size_truncate: true
    max_label_name_length: 5048
    max_label_value_length: 5120
    increment_duplicate_timestamp: true
    max_query_series: 1000000
    max_query_parallelism: 5000
    query_timeout: 15m
    max_query_length: 0s
    ingestion_rate_mb: 10
    tsdb_max_query_parallelism: 5000
    discover_log_levels: false
    per_stream_rate_limit: 10MB
    per_stream_rate_limit_burst: 50MB
    retention_period: 9600h # 400 days
    split_queries_by_interval: 12h
    max_concurrent_tail_requests: 150
    per_tenant_override_config: /etc/overrides.yaml
  schemaConfig:
    configs:
      - from: 2024-04-01
        store: tsdb
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  compactor:
    working_directory: /var/loki/data/compactor
    compaction_interval: 10m
    retention_enabled: true
    retention_delete_delay: 0s
    retention_delete_worker_count: 150
    delete_request_store: aws
  ingester:
    chunk_encoding: snappy
  querier:
    # Default is 4; increase if you have spare memory and CPU, reduce if OOMing
    max_concurrent: 15
  tracing:
    enabled: true
  memcached:
    chunk_cache:
      enabled: true
      service: "memcache"
    results_cache:
      enabled: true
      service: "memcache"
      default_validity: 12h
  storage:
    type: "s3"
    s3:
      region: us-east-1
    bucketNames:
      chunks: myapp-loki
      ruler: myapp-loki
      admin: myapp-loki
  structuredConfig:
    chunk_store_config:
      skip_query_writeback_cache_older_than: 168h
      l2_chunk_cache_handoff: 168h
      chunk_cache_config_l2:
        background:
          writeback_goroutines: 10
        memcached:
          expiration: 168h
          batch_size: 200
          parallelism: 200
        memcached_client:
          service: "memcache"
      chunk_cache_config:
        memcached:
          expiration: 168h
          batch_size: 200
          parallelism: 200
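      # My understanding of the two tiers above (please correct me if
      # I have this wrong): chunks younger than l2_chunk_cache_handoff
      # (168h) go through the L1 chunk cache and older ones through L2,
      # while chunks older than skip_query_writeback_cache_older_than
      # are not written back to the cache when queries fetch them.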
gateway:
  resources:
    requests:
      cpu: "500m"
      memory: "2Gi"
    limits:
      cpu: "500m"
      memory: "2Gi"
  replicas: 4
  autoscaling:
    # -- Enable autoscaling for the gateway
    enabled: true
    # -- Minimum autoscaling replicas for the gateway
    minReplicas: 4
    # -- Maximum autoscaling replicas for the gateway
    maxReplicas: 6
    # -- Target CPU utilisation percentage for the gateway
    targetCPUUtilizationPercentage: 60
    # -- Target memory utilisation percentage for the gateway
    targetMemoryUtilizationPercentage:
    # -- See `kubectl explain deployment.spec.strategy` for more
    # -- ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy
    # -- Behavior policies while scaling.
    behavior:
      scaleUp:
        stabilizationWindowSeconds: 300
        policies:
          - type: Pods
            value: 1
            periodSeconds: 60
      scaleDown:
        stabilizationWindowSeconds: 300
        policies:
          - type: Pods
            value: 1
            periodSeconds: 180
  ingress:
    # -- Specifies whether an ingress for the gateway should be created
    enabled: true
    # -- Ingress Class Name. MAY be required for Kubernetes versions >= 1.18
    ingressClassName: "alb"
    # -- Annotations for the gateway ingress
    annotations:
      external-dns.alpha.kubernetes.io/hostname: gateway.loki.myapp.int
      alb.ingress.kubernetes.io/scheme: internal
      alb.ingress.kubernetes.io/security-groups: SECURITY_GROUP_ID
      alb.ingress.kubernetes.io/manage-backend-security-group-rules: "false"
      alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 8080}]'
      alb.ingress.kubernetes.io/target-type: ip
      alb.ingress.kubernetes.io/ip-address-type: ipv4
      alb.ingress.kubernetes.io/target-group-attributes: deregistration_delay.timeout_seconds=30,slow_start.duration_seconds=0,load_balancing.algorithm.type=least_outstanding_requests
      alb.ingress.kubernetes.io/load-balancer-attributes: idle_timeout.timeout_seconds=900
      nginx.ingress.kubernetes.io/proxy-read-timeout: "900"
      nginx.ingress.kubernetes.io/proxy-send-timeout: "900"
      nginx.ingress.kubernetes.io/proxy-connect-timeout: "900"
    # -- Hosts configuration for the gateway ingress, passed through the `tpl` function to allow templating
    hosts:
      - host: gateway.loki.myapp.int
        paths:
          - path: /
            # -- pathType (e.g. ImplementationSpecific, Prefix, etc.) might also be required by some Ingress Controllers
            pathType: Prefix
chunksCache:
  allocatedMemory: 30000
  defaultValidity: 24h
  parallelism: 200
  batchSize: 1000
  replicas: 4
  enabled: true
  resources:
    requests:
      cpu: "1"
      memory: "30Gi"
    limits:
      cpu: "1"
      memory: "30Gi"
deploymentMode: Distributed
ingester:
  maxUnavailable: 1
  replicas: 15
  resources:
    requests:
      cpu: "4"
      memory: "8Gi"
    limits:
      cpu: "4"
      memory: "8Gi"
querier:
  replicas: 40
  maxUnavailable: 1
  resources:
    requests:
      cpu: "14"
      memory: "14Gi"
    limits:
      cpu: "14"
      memory: "14Gi"
queryFrontend:
  maxUnavailable: 1
  replicas: 10
  resources:
    requests:
      cpu: "8"
      memory: "8Gi"
    limits:
      cpu: "8"
      memory: "8Gi"
  cache:
    enabled: true
    memcached:
      service: "memcache"
queryScheduler:
  maxUnavailable: 1
  replicas: 5
  resources:
    requests:
      cpu: "1"
      memory: "500Mi"
    limits:
      cpu: "1"
      memory: "500Mi"
distributor:
  replicas: 4
  maxUnavailable: 1
  resources:
    requests:
      cpu: "2"
      memory: "500Mi"
    limits:
      cpu: "2"
      memory: "500Mi"
indexGateway:
  replicas: 40
  maxUnavailable: 1
  resources:
    requests:
      cpu: "4"
      memory: "16Gi"
    limits:
      cpu: "4"
      memory: "16Gi"
minio:
  enabled: false
serviceAccount:
  annotations:
    "eks.amazonaws.com/role-arn": ROLE_ARN
compactor:
  replicas: 0
  maxUnavailable: 1
  resources:
    requests:
      cpu: "2"
      memory: "10Gi"
# Zero out replica counts of other deployment modes
singleBinary:
  replicas: 0
bloomBuilder:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0
write:
  replicas: 0
read:
  replicas: 0
backend:
  replicas: 0