Hello, we have started using Loki for our logs. For shorter time ranges it works great, but for larger ranges (more than 7 days) queries become too slow.
We have a node pool of 3 nodes (8 CPU, 32 GB RAM each) in GKE; the config is below.
I especially want to know whether the tsdb_max_query_parallelism and split_queries_by_interval settings are right for this requirement (see my rough math after the config).
deploymentMode: SimpleScalable
gateway:
  tolerations:
    - key: "dedicated"
      operator: "Equal"
      value: "logs"
      effect: "NoSchedule"
  basicAuth:
    enabled: true
loki:
  image:
    tag: 3.5.1
  runtimeConfig:
    overrides:
      prod:
        retention_period: 720h # 30 days
        ingestion_rate_mb: 100
        ingestion_burst_size_mb: 100
        per_stream_rate_limit: 5MB
        per_stream_rate_limit_burst: 20MB
        max_line_size: 256Kb
        max_line_size_truncate: true
  server:
    # https://grafana.com/docs/tempo/latest/troubleshooting/querying/response-too-large/
    grpc_server_max_recv_msg_size: 30485760
    grpc_server_max_send_msg_size: 30485760
  limits_config:
    reject_old_samples: true
    reject_old_samples_max_age: 168h
    shard_streams:
      enabled: true
    allow_structured_metadata: true
    volume_enabled: true
    retention_period: 720h
    split_queries_by_interval: 1h
    max_query_parallelism: 100
    tsdb_max_query_parallelism: 350
    max_line_size: 512000
    max_line_size_truncate: true
  chunk_store_config:
    chunk_cache_config:
      background:
        writeback_buffer: 5000
        writeback_goroutines: 1
        writeback_size_limit: 500MB
      memcached:
        batch_size: 3
        parallelism: 2
      memcached_client:
        addresses: dnssrvnoa+_memcached-client._tcp.loki-chunks-cache.logs.svc
        consistent_hash: true
        max_idle_conns: 72
        timeout: 60s
  query_range:
    parallelise_shardable_queries: true
    align_queries_with_step: true
    cache_results: true
    results_cache:
      cache:
        background:
          writeback_buffer: 500000
          writeback_goroutines: 1
          writeback_size_limit: 500MB
        default_validity: 6h
        memcached_client:
          addresses: dnssrvnoa+_memcached-client._tcp.loki-results-cache.logs.svc
          consistent_hash: true
          timeout: 60s
          update_interval: 1m
          #
          # max_item_size: 5m
  storage_config:
    hedging:
      at: 250ms
      max_per_second: 20
      up_to: 3
  auth_enabled: true
  frontend_worker:
    grpc_client_config:
      max_send_msg_size: 30485760
  schemaConfig:
    configs:
      - from: "2024-04-01"
        store: tsdb
        object_store: gcs
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  ingester:
    chunk_encoding: snappy
  querier:
    # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing
    max_concurrent: 9
  pattern_ingester:
    enabled: true
  compactor:
    retention_enabled: true
    delete_request_store: gcs
  storage:
    bucketNames:
      chunks: chunks
      ruler: ruler
    type: gcs
    gcs:
      chunkBufferSize: 0
      requestTimeout: "10s"
      enableHttp2: true
chunksCache:
  allocatedMemory: 4096
  maxItemMemory: 2
  replicas: 2
  tolerations:
    - key: "dedicated"
      operator: "Equal"
      value: "logs"
      effect: "NoSchedule"
resultsCache:
  allocatedMemory: 4096
  defaultValidity: 8h
  maxItemMemory: 5
  replicas: 2
  tolerations:
    - key: "dedicated"
      operator: "Equal"
      value: "logs"
      effect: "NoSchedule"
backend:
  replicas: 2
  tolerations:
    - key: "dedicated"
      operator: "Equal"
      value: "logs"
      effect: "NoSchedule"
read:
  replicas: 4
  tolerations:
    - key: "dedicated"
      operator: "Equal"
      value: "logs"
      effect: "NoSchedule"
  resources:
    requests:
      cpu: 1000m
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchLabels:
              app.kubernetes.io/component: read
          topologyKey: kubernetes.io/hostname
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 50
          preference:
            matchExpressions:
              - key: dedicated
                operator: In
                values:
                  - logs
  autoscaling:
    enabled: true
    minReplicas: 4
    maxReplicas: 8
    targetCPUUtilizationPercentage: 80
    behavior:
      scaleUp:
        stabilizationWindowSeconds: 300
        policies:
          - type: Pods
            value: 1
            periodSeconds: 60
      scaleDown:
        stabilizationWindowSeconds: 300
        policies:
          - type: Pods
            value: 1
            periodSeconds: 180
write:
  replicas: 3 # To ensure data durability with replication
  tolerations:
    - key: "dedicated"
      operator: "Equal"
      value: "logs"
      effect: "NoSchedule"
minio:
  enabled: false
lokiCanary:
  enabled: false
test:
  enabled: false
monitoring:
  dashboards:
    enabled: true
  serviceMonitor:
    enabled: true
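
For reference, this is the back-of-the-envelope math I am using to reason about those two settings. My understanding of the docs (please correct me if it is wrong) is that split_queries_by_interval breaks one long query into per-interval sub-queries, tsdb_max_query_parallelism is the per-tenant cap the frontend uses when scheduling them (since the schema is tsdb), and the practical ceiling is the number of querier worker slots, i.e. read replicas × querier.max_concurrent. A small sketch of my own, using the values from the config above and ignoring query sharding:

from math import ceil

# Values taken from the config above; the ">7 days" range is the slow case.
query_range_hours = 7 * 24        # a 7-day query
split_interval_hours = 1          # split_queries_by_interval: 1h
tsdb_parallelism = 350            # tsdb_max_query_parallelism (schema is tsdb)
read_replicas = 4                 # HPA minReplicas for the read pods (max 8)
querier_max_concurrent = 9        # querier.max_concurrent

sub_queries = ceil(query_range_hours / split_interval_hours)  # 168 sub-queries
scheduled = min(sub_queries, tsdb_parallelism)                # 168, so the cap never bites
worker_slots = read_replicas * querier_max_concurrent         # 36 (72 at maxReplicas)
waves = ceil(scheduled / worker_slots)                        # roughly 5 sequential waves

print(sub_queries, scheduled, worker_slots, waves)

If that reasoning is roughly right, the parallelism limits are not what caps a 7-day query here; the 36-72 querier worker slots are. That is why I am unsure whether these two settings are the right knobs, or whether I should instead be looking at the read replicas, querier.max_concurrent, or the split interval.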