Hi team,
I’m running Loki in SimpleScalable mode on EKS, ingesting around 10 million log lines per day (~5 GB/day).
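(For scale, 5 GB spread over 24 hours averages out to only about 60 KB/s of ingest, or roughly 115 log lines per second.)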
However, query performance is significantly slower than what we see with Grafana Cloud Loki, even though I’ve already tuned many parameters.
Below is my current configuration (simplified). I would appreciate guidance on whether any of these settings are incorrect. Thank you.
---
deploymentMode: SimpleScalable

global:
  dnsNamespace: kube-system
  clusterDomain: cluster.local

loki:
  server:
    grpc_server_max_recv_msg_size: 209715200 #200MB
    grpc_server_max_send_msg_size: 209715200 #200MB
    grpc_server_max_concurrent_streams: 2000
    grpc_server_keepalive_timeout: 60s
    grpc_server_num_workers: 8
    http_server_read_timeout: 2m
    http_server_write_timeout: 2m
    http_server_idle_timeout: 2m
  auth_enabled: false
  schemaConfig:
    configs:
      - from: 2024-04-01
        store: tsdb
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 1d
  commonConfig:
    replication_factor: 2
  storage:
    type: s3
    bucketNames:
      chunks: grafana-loki-abcdef
      ruler: grafana-loki-abcdef
      #admin: grafana-loki-abcdef
    s3:
      region: ap-southeast-3
      insecure: false
  compactor:
    # Need to be enabled for retention to work
    retention_enabled: true
    # how long to wait before deleting the data
    retention_delete_delay: 2h
    delete_request_store: s3
  bloom_build:
    enabled: true
    planner:
      planning_interval: 3h
    builder:
      planner_address: loki-backend.loki.svc.cluster.local:9095
  # Configuration block for bloom filtering.
  bloom_gateway:
    enabled: true
    client:
      addresses: loki-backend.loki.svc.cluster.local:9095
  limits_config:
    bloom_creation_enabled: true
    bloom_split_series_keyspace_by: 1024
    bloom_gateway_enable_filtering: true
    ingestion_rate_strategy: local
    shard_streams:
      enabled: true
      desired_rate: 4194304 #4MiB
    allow_structured_metadata: true
    discover_log_levels: true
    #discover_service_name: [kubernetes_container_name]
    volume_enabled: true
    retention_period: 30d # 28 days retention
    split_queries_by_interval: 1d
    unordered_writes: true
    max_query_series: 1000000 # 10000 unique series by metric query
    max_streams_per_user: 1000 # 1000 streams per user
    max_entries_limit_per_query: 10000
    max_global_streams_per_user: 1000000
    max_query_parallelism: 240 # Maximum number of queries that can be executed in parallel
    max_concurrent_tail_requests: 500 # Maximum number of concurrent tail requests
    tsdb_max_query_parallelism: 400 # Maximum number of queries that can be executed in parallel for TSDB
    tsdb_max_bytes_per_shard: 2GB # Maximum bytes per shard for TSDB
    max_line_size: 256000 # 256KB
    max_line_size_truncate: true # Truncate lines that are too long
    # Precompute chunks for TSDB, This can improve query performance at the
    # cost of increased memory usage by computing chunks once during planning,
    # reducing index calls.
    tsdb_precompute_chunks: true
    query_timeout: 10m # 10 minutes
    ingestion_rate_mb: 20 # Increase per-tenant rate (MB/sec)
    ingestion_burst_size_mb: 30 # Allow short bursts
    per_stream_rate_limit: 30MB # Maximum byte rate per second per stream
    per_stream_rate_limit_burst: 50MB # Maximum burst bytes per stream
    max_query_bytes_read: 600GB # Maximum bytes read per query
    max_querier_bytes_read: 600GB # Maximum bytes read per querier
    max_query_lookback: 7d # Maximum lookback period for queries
  memcached:
    chunk_cache:
      enabled: true
      host: loki-chunks-cache.loki.svc.cluster.local
      service: loki-chunks-cache
      batch_size: 256
      parallelism: 20
    results_cache:
      enabled: true
      host: loki-results-cache.loki.svc.cluster.local
      service: loki-results-cache
      default_validity: 12h
  ingester:
    chunk_encoding: snappy
    chunk_block_size: 262144 # 512KiB
    chunk_target_size: 2097152 # 2MiB
    lifecycler:
      # Consider setting this to false to only check the health of the ingester itself, reducing the waiting time
      readiness_check_ring_health: false
  tracing:
    enabled: false
  pattern_ingester:
    enabled: true
  frontend:
    log_queries_longer_than: 5s
    grpc_client_config:
      grpc_compression: snappy
      max_recv_msg_size: 2097152000 #200MB
      max_send_msg_size: 2097152000 #200MB
  query_range:
    align_queries_with_step: true
    cache_results: true
    cache_instant_metric_results: true
    parallelise_shardable_queries: true
    max_retries: 5
    volume_results_cache:
      cache:
        memcached:
          batch_size: 64
          parallelism: 8
        memcached_client:
          addresses: loki-results-cache.loki.svc.cluster.local:11211
          max_idle_conns: 16
          timeout: 200ms
    results_cache:
      cache:
        # We're going to use the in-process "FIFO" cache
        enable_fifocache: true
        fifocache:
          max_size_bytes: 2GB
          validity: 12h
      compression: snappy
  querier:
    # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing
    max_concurrent: 100
  query_scheduler:
    max_outstanding_requests_per_tenant: 1000
    grpc_client_config:
      max_recv_msg_size: 1.048576e+11
      max_send_msg_size: 1.048576e+11

chunksCache:
  enabled: true
  replicas: 1
  defaultValidity: 1h
  maxItemMemory: 10
  persistence:
    enabled: true
    storageClass: ebs-sc-encrypted
    size: 10Gi
  # allocated memory for chunks cache
  # https://github.com/grafana/loki/blob/main/production/helm/loki/templates/memcached/_memcached-statefulset.tpl#L101
  allocatedMemory: 13000 # 13GB
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: topology.kubernetes.io/zone
                operator: In
                values:
                  - ap-southeast-3c

resultsCache:
  enabled: true
  maxItemMemory: 10
  persistence:
    enabled: true
    storageClass: ebs-sc-encrypted
    size: 10Gi
  resources:
    limits:
      cpu: 1
      memory: 1Gi
    requests:
      cpu: 5m
      memory: 50Mi
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: topology.kubernetes.io/zone
                operator: In
                values:
                  - ap-southeast-3c

memcached:
  extraArgs:
    log.level: warn
  resources:
    limits:
      cpu: 1
      memory: 8Gi
    requests:
      cpu: 5m
      memory: 50Mi

memcachedExporter:
  enabled: true
  extraArgs:
    log.level: info
  resources:
    limits:
      cpu: 1
      memory: 2Gi
    requests:
      cpu: 5m
      memory: 50Mi

test:
  enabled: false

lokiCanary:
  enabled: false

storage_config:
  aws:
    region: ap-southeast-3
    bucketnames: grafana-loki-abcdef
    s3forcepathstyle: false
    backoff_config:
      min_period: 2s
      max_period: 5s
      max_retries: 5
  tsdb_shipper:
    active_index_directory: /data/loki/tsdb-index
    cache_location: /data/loki/tsdb-cache
    cache_ttl: 12h
    flush_interval: 10m
    object_store: s3
    bucket: grafana-loki-abcdef

chunk_store_config:
  chunk_cache_config:
    memcached:
      batch_size: 64
      parallelism: 8
    memcached_client:
      addresses: loki-chunks-cache.loki.svc.cluster.local:11211
      max_idle_conns: 16
      timeout: 200ms

serviceAccount:
  create: true
  annotations:
    "eks.amazonaws.com/role-arn": "arn:aws:iam::xxx:role/LokiRole"

backend:
  persistence:
    enabled: true
    storageClass: ebs-sc-encrypted
    size: 10Gi
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: topology.kubernetes.io/zone
                operator: In
                values:
                  - ap-southeast-3c
  replicas: 2
  resources:
    limits:
      cpu: 1
      memory: 4Gi
    requests:
      cpu: 10m
      memory: 50Mi

read:
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 15
    targetCPUUtilizationPercentage: 80
    targetMemoryUtilizationPercentage: 80
  replicas: 15
  resources:
    limits:
      cpu: 1
      memory: 6Gi
    requests:
      cpu: 10m
      memory: 50Mi
  # affinity:
  #   podAntiAffinity: {}

write:
  persistence:
    enabled: true
    storageClass: ebs-sc-encrypted
    size: 10Gi
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: topology.kubernetes.io/zone
                operator: In
                values:
                  - ap-southeast-3c
  replicas: 2
  resources:
    limits:
      cpu: 1
      memory: 6Gi
    requests:
      cpu: 25m
      memory: 250Mi

gateway:
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 80
    targetMemoryUtilizationPercentage: 80
  nginxConfig:
    clientMaxBodySize: 200M
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: topology.kubernetes.io/zone
                operator: In
                values:
                  - ap-southeast-3c
  resources:
    limits:
      cpu: 1
      memory: 2Gi
    requests:
      cpu: 10m
      memory: 50Mi
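For reference, the values above are applied with a standard install of the grafana/loki Helm chart, roughly as follows (the release name, namespace, and values file name are simply what I use; adjust to your environment):

helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
# Release "loki" in namespace "loki" matches the service addresses used above,
# e.g. loki-backend.loki.svc.cluster.local and loki-chunks-cache.loki.svc.cluster.local
helm upgrade --install loki grafana/loki --namespace loki -f values.yaml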