After upgrading to Loki 3.0 (with TSDB enabled), we are facing performance issues with Loki: queries take a very long time to complete. Everything worked fine on version 2.9, before the upgrade. Even when we query older data that is not part of the TSDB schema (i.e. data served from boltdb-shipper), which used to work fine, querier latency has increased 3-4 times since the upgrade. Any help or configuration changes that could improve query time would be appreciated.
Setup Details: Loki Version: 3.1
Deployment Type: Helm chart Loki Distributed
Ingesters: 7
Distributor: 7
index gateway: 3
Querier - autoscale
Frontend - autoscale
Scheduler - 5
Chunk/result cache - 3 each
Loki Configuration:
# Authentication is switched on for this deployment.
auth_enabled: true
# Chunk store caching: a memcached-backed chunk cache plus a memcached-backed
# write-dedupe cache. Nesting restored; the flattened paste made every child a
# top-level key and produced duplicate `memcached_client`/`addresses` keys.
chunk_store_config:
  cache_lookups_older_than: 2h
  chunk_cache_config:
    default_validity: 10m
    memcached:
      batch_size: 256
      # NOTE(review): expiration (5m) is shorter than default_validity (10m)
      # above; confirm which one is meant to govern chunk cache lifetime.
      expiration: 5m
      parallelism: 10
    memcached_client:
      addresses: dnssrv+_memcached-client._tcp.loki-distributed-memcached-chunks.logging.svc.cluster.local
      circuit_breaker_consecutive_failures: 10
      circuit_breaker_interval: 180s
      circuit_breaker_timeout: 180s
      consistent_hash: true
      max_item_size: 4845728
      timeout: 1000ms
  write_dedupe_cache_config:
    memcached_client:
      addresses: dnssrv+_memcached-client._tcp.loki-distributed-memcached-index-writes.logging.svc.cluster.local
      consistent_hash: true
# Settings shared across components; only the compactor address is set here.
common:
  compactor_address: http://loki-distributed-compactor:3100
# Compactor with retention enabled; delete requests are stored in the `aws`
# store and the working files live under /var/loki/retention.
compactor:
  delete_request_store: aws
  retention_delete_delay: 24h
  retention_delete_worker_count: 150
  retention_enabled: true
  working_directory: /var/loki/retention
# Distributor ring backed by the memberlist key-value store.
distributor:
  ring:
    kvstore:
      store: memberlist
# Query frontend: compresses responses, talks to the query scheduler over
# snappy-compressed gRPC, and logs queries slower than 5s.
frontend:
  compress_responses: true
  grpc_client_config:
    grpc_compression: snappy
    # NOTE(review): 16777216000 bytes is roughly 16.8 GB; confirm this is
    # intentional and not 16777216 (16 MiB) with three extra zeros.
    max_recv_msg_size: 16777216000
    max_send_msg_size: 16777216000
  log_queries_longer_than: 5s
  querier_forget_delay: 10s
  scheduler_address: loki-distributed-query-scheduler:9095
  tail_proxy_url: http://loki-distributed-querier:3100
  scheduler_worker_concurrency: 20
# Querier-side worker that pulls work from the query scheduler over
# snappy-compressed gRPC.
frontend_worker:
  grpc_client_config:
    grpc_compression: snappy
    # NOTE(review): 16777216000 bytes is roughly 16.8 GB; confirm this is
    # intentional and not 16777216 (16 MiB) with three extra zeros.
    max_recv_msg_size: 16777216000
    max_send_msg_size: 16777216000
  scheduler_address: loki-distributed-query-scheduler:9095
# Index gateway running in ring mode, with the ring kept in memberlist.
index_gateway:
  mode: ring
  ring:
    kvstore:
      store: memberlist
# Ingester: chunk sizing/flush behaviour, ring membership via memberlist, and
# a write-ahead log under /var/loki/wal.
ingester:
  autoforget_unhealthy: true
  chunk_block_size: 262144
  chunk_encoding: snappy
  chunk_idle_period: 5m
  chunk_retain_period: 1m
  chunk_target_size: 3145728
  concurrent_flushes: 48
  flush_check_period: 15s
  lifecycler:
    ring:
      kvstore:
        store: memberlist
      # NOTE(review): a replication factor of 1 means each stream is held by
      # a single ingester; confirm this is an accepted trade-off.
      replication_factor: 1
  max_chunk_age: 30m
  max_returned_stream_errors: 0
  wal:
    dir: /var/loki/wal
    enabled: true
    flush_on_shutdown: true
    replay_memory_ceiling: 4GB
# gRPC client used to reach ingesters, plus its connection-pool settings.
ingester_client:
  grpc_client_config:
    grpc_compression: snappy
    # NOTE(review): 16777216000 bytes is roughly 16.8 GB; confirm this is
    # intentional and not 16777216 (16 MiB) with three extra zeros.
    max_recv_msg_size: 16777216000
    max_send_msg_size: 16777216000
  pool_config:
    client_cleanup_period: 5s
    health_check_ingesters: true
# Per-tenant limits for ingestion and querying. Values are unchanged from the
# original paste; only the nesting under `limits_config` is restored.
limits_config:
  allow_structured_metadata: false
  cardinality_limit: 400000
  deletion_mode: filter-and-delete
  ingestion_burst_size_mb: 148
  ingestion_rate_mb: 128
  ingestion_rate_strategy: global
  max_cache_freshness_per_query: 10m
  max_chunks_per_query: 5000000
  max_entries_limit_per_query: 10000
  max_global_streams_per_user: 20000
  max_label_names_per_series: 30
  max_line_size: 0
  max_querier_bytes_read: 150GB
  max_query_parallelism: 256
  max_query_series: 50000
  max_stats_cache_freshness: 0
  max_streams_matchers_per_query: 10000
  max_streams_per_user: 0
  per_stream_rate_limit: 30MB
  per_stream_rate_limit_burst: 50MB
  reject_old_samples: false
  retention_period: 8760h
  split_queries_by_interval: 15m
  # NOTE(review): the three tsdb_* limits only apply to TSDB index periods,
  # i.e. the data that became slow after the upgrade. A larger per-shard byte
  # target presumably yields fewer shards per query; confirm these values
  # against the documented defaults before tuning anything else.
  tsdb_max_bytes_per_shard: 1000MB
  tsdb_max_query_parallelism: 1000
  tsdb_precompute_chunks: true
  unordered_writes: true
# Memberlist gossip: the single seed address every component joins through.
memberlist:
  join_members:
    - loki-distributed-memberlist
# Querier: engine look-back, concurrency, multi-tenant queries, and the window
# within which ingesters are queried.
querier:
  engine:
    max_look_back_period: 30s
  # NOTE(review): queriers autoscale (per the setup notes), so total query
  # capacity presumably scales with max_concurrent x replica count; confirm
  # it is in line with the parallelism limits set under limits_config.
  max_concurrent: 10
  multi_tenant_queries_enabled: true
  query_ingesters_within: 1h
# Query-range middleware: step alignment, sharding, and a memcached-backed
# results cache. Nesting restored under results_cache.cache.
query_range:
  align_queries_with_step: true
  cache_results: true
  max_retries: 1
  parallelise_shardable_queries: true
  results_cache:
    cache:
      memcached:
        batch_size: 1024
        expiration: 10m
        parallelism: 100
      memcached_client:
        addresses: dnssrv+_memcached-client._tcp.loki-distributed-memcached-frontend.logging.svc.cluster.local
        consistent_hash: true
        max_item_size: 4597152
        timeout: 500ms
        update_interval: 1m
# Query scheduler queue sizing and how long a disconnected querier is
# remembered.
query_scheduler:
  max_outstanding_requests_per_tenant: 32768
  max_queue_hierarchy_levels: 0
  querier_forget_delay: 15s
# Runtime overrides file and its reload period.
runtime_config:
  file: /var/loki-distributed-runtime/runtime.yaml
  period: 120s
# Index/chunk schema periods, oldest first. Each list item is one period; the
# flattened paste had lost the item boundaries and the nested `index` maps.
schema_config:
  configs:
    # boltdb-shipper period (schema v11), in effect from 2020-09-07.
    # NOTE(review): this period names its object store `aws` while the later
    # periods use `s3`; presumably the same backend - confirm.
    - from: "2020-09-07"
      index:
        period: 24h
        prefix: loki_index_
      object_store: aws
      schema: v11
      store: boltdb-shipper
    # TSDB with schema v12; superseded one day later by the v13 period below.
    - from: "2024-07-14"
      index:
        period: 24h
        prefix: loki_index_
      object_store: s3
      schema: v12
      store: tsdb
    # TSDB with schema v13, the current period.
    - from: "2024-07-15"
      index:
        period: 24h
        prefix: loki_index_
      object_store: s3
      schema: v13
      store: tsdb
# HTTP/gRPC server limits and timeouts shared by every component.
server:
  grpc_server_max_concurrent_streams: 10000
  # NOTE(review): 41943040000 bytes is roughly 41.9 GB; confirm this is
  # intentional and not 41943040 (40 MiB) with three extra zeros.
  grpc_server_max_recv_msg_size: 41943040000
  grpc_server_max_send_msg_size: 41943040000
  http_listen_port: 3100
  http_server_idle_timeout: 300s
  http_server_read_timeout: 300s
  http_server_write_timeout: 300s
# Storage backends: S3 object storage, the boltdb-shipper and TSDB shippers
# (both pointing at the index gateway), a local filesystem path, and a
# memcached-backed index-query cache.
storage_config:
  aws:
    s3: s3://xxxxx
  boltdb_shipper:
    active_index_directory: /var/loki/index
    cache_location: /var/loki/cache
    cache_ttl: 24h
    index_gateway_client:
      server_address: dns:///loki-distributed-index-gateway:9095
  filesystem:
    directory: /var/loki/chunks
  index_cache_validity: 3m
  index_queries_cache_config:
    memcached:
      batch_size: 100
      expiration: 10m
      parallelism: 100
    memcached_client:
      addresses: dnssrv+_memcached-client._tcp.loki-distributed-memcached-index-queries.logging.svc.cluster.local
      consistent_hash: true
      timeout: 1000ms
  tsdb_shipper:
    active_index_directory: /var/loki/tsdb-index
    cache_location: /var/loki/tsdb-cache
    # NOTE(review): unlike boltdb_shipper above, no cache_ttl is set here, and
    # the gateway address uses the fully-qualified service name instead of the
    # short one; confirm both differences are intentional. Also confirm how
    # server_address interacts with `index_gateway.mode: ring`.
    index_gateway_client:
      server_address: dns:///loki-distributed-index-gateway.logging.svc.cluster.local:9095