Hi,
We’re having an issue with our Promtail-Loki-Grafana stack, which is configured to collect logs mostly from network equipment (often low volume) and from other software that is also mostly low volume. That’s why we reduced the chunk target size (to avoid incomplete chunks).
What we experience is Loki’s memory usage going up until OOM when we use playlist mode in Grafana, which issues a lot of requests, but only over the last 6 hours. We are using a fully local installation with local storage, and we can’t find out why the memory usage goes this high while our entire logs for that duration are much smaller.
We think it might be an issue with uncompressed chunks from requests remaining in memory, duplicated and never cleared. When we stop using playlist mode, the memory usage returns to a stable level. Is there anything about the chunk lifecycle that we haven’t properly understood?
Also, we noticed that we lose the last minutes of logs when Loki goes OOM, so this is a very concerning situation for us. We had issues configuring the WAL, which might be related, but that is another issue we’re facing and not the main topic of this post.
This is the memory usage we see on the host with the Prometheus stack:
Here you can see the difference between when we are not doing requests all the time vs when we are. Most of it is used by loki.
What do you think we are experiencing? Why is our memory usage this high, while our whole folder for the chunks and boltdb is only 1.8 GB for a few months of logs?
Here is Loki’s config YAML:
# Loki configuration: single process, boltdb-shipper index, filesystem storage.
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation, using Loki's documented layout and the effective-config dump
# later in this post. All values are unchanged; confirm against the real file.
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 0.0.0.0
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  # Any chunk not receiving new logs in this time will be flushed.
  chunk_idle_period: 24h
  # All chunks will be flushed when they hit this age, default is 1h.
  max_chunk_age: 48h
  # Loki will attempt to build chunks up to 256 KiB (262144 bytes), flushing
  # first if chunk_idle_period or max_chunk_age is reached first.
  # NOTE(review): with a 24h idle period and a 48h max age, low-volume streams
  # presumably stay in ingester memory for a long time before being flushed;
  # confirm whether this contributes to the memory growth described above.
  chunk_target_size: 262144
  # Must be greater than index read cache TTL if using an index cache
  # (default index read cache TTL is 5m).
  chunk_retain_period: 20m
  # Chunk transfers disabled.
  max_transfer_retries: 0

schema_config:
  configs:
    - from: "2020-10-24"
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /var/local/loki/boltdb-shipper-active
    cache_location: /tmp/loki/boltdb-shipper-cache
    # Can be increased for faster performance over longer query periods,
    # uses more disk space.
    cache_ttl: 24h
    shared_store: filesystem
  filesystem:
    directory: /var/local/loki/chunks

compactor:
  working_directory: /tmp/loki/boltdb-shipper-compactor
  shared_store: filesystem

limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

table_manager:
  retention_deletes_enabled: true
  retention_period: 365d

ruler:
  storage:
    type: local
    local:
      directory: /etc/loki/rules/
  rule_path: /etc/loki/rules-temp/
  alertmanager_url: http://localhost:9093
  ring:
    kvstore:
      store: inmemory
  enable_api: true
  enable_alertmanager_v2: true
And here is Loki’s effective config, as printed from the command line:
---
# Loki Config
# (version=2.2.1, branch=HEAD, revision=babea82e)
target: all
http_prefix: ""
server:
http_listen_address: ""
http_listen_port: 3100
http_listen_conn_limit: 0
grpc_listen_address: ""
grpc_listen_port: 9095
grpc_listen_conn_limit: 0
http_tls_config:
cert_file: ""
key_file: ""
client_auth_type: ""
client_ca_file: ""
grpc_tls_config:
cert_file: ""
key_file: ""
client_auth_type: ""
client_ca_file: ""
register_instrumentation: true
graceful_shutdown_timeout: 30s
http_server_read_timeout: 30s
http_server_write_timeout: 30s
http_server_idle_timeout: 2m0s
grpc_server_max_recv_msg_size: 4194304
grpc_server_max_send_msg_size: 4194304
grpc_server_max_concurrent_streams: 100
grpc_server_max_connection_idle: 2562047h47m16.854775807s
grpc_server_max_connection_age: 2562047h47m16.854775807s
grpc_server_max_connection_age_grace: 2562047h47m16.854775807s
grpc_server_keepalive_time: 2h0m0s
grpc_server_keepalive_timeout: 20s
grpc_server_min_time_between_pings: 5m0s
grpc_server_ping_without_stream_allowed: false
log_format: logfmt
log_level: info
log_source_ips_enabled: false
log_source_ips_header: ""
log_source_ips_regex: ""
http_path_prefix: ""
distributor:
ring:
kvstore:
store: consul
prefix: collectors/
consul:
host: localhost:8500
acl_token: ""
http_client_timeout: 20s
consistent_reads: false
watch_rate_limit: 1
watch_burst_size: 1
etcd:
endpoints: []
dial_timeout: 10s
max_retries: 10
tls_enabled: false
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
multi:
primary: ""
secondary: ""
mirror_enabled: false
mirror_timeout: 2s
heartbeat_period: 5s
heartbeat_timeout: 1m0s
instance_id: logtest0
instance_interface_names:
- eth0
- en0
instance_port: 0
instance_addr: ""
# Query execution settings as reported by the effective-config dump.
querier:
query_timeout: 1m0s
tail_max_duration: 1h0m0s
engine:
timeout: 5m0s
max_look_back_period: 30s
# NOTE(review): presumably caps the number of queries running at once at 20;
# worth checking against the memory growth seen when a Grafana playlist issues
# many requests — confirm against the Loki configuration reference.
max_concurrent: 20
ingester_client:
pool_config:
client_cleanup_period: 15s
health_check_ingesters: true
remote_timeout: 5s
grpc_client_config:
max_recv_msg_size: 104857600
max_send_msg_size: 16777216
grpc_compression: ""
rate_limit: 0
rate_limit_burst: 0
backoff_on_ratelimits: false
backoff_config:
min_period: 100ms
max_period: 10s
max_retries: 10
tls_enabled: false
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
# Ingester settings as reported by the effective-config dump (indentation was
# lost in the paste, so nesting is not visible here).
ingester:
lifecycler:
ring:
kvstore:
# Ring state is kept in memory, matching the user config earlier in this post.
store: inmemory
prefix: collectors/
consul:
host: localhost:8500
acl_token: ""
http_client_timeout: 20s
consistent_reads: false
watch_rate_limit: 1
watch_burst_size: 1
etcd:
endpoints: []
dial_timeout: 10s
max_retries: 10
tls_enabled: false
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
multi:
primary: ""
secondary: ""
mirror_enabled: false
mirror_timeout: 2s
heartbeat_timeout: 1m0s
replication_factor: 1
zone_awareness_enabled: false
num_tokens: 128
heartbeat_period: 5s
observe_period: 0s
join_after: 0s
min_ready_duration: 1m0s
interface_names:
- eth0
- en0
final_sleep: 0s
tokens_file_path: ""
availability_zone: ""
unregister_on_shutdown: true
address: 0.0.0.0
port: 0
id: logtest0
concurrent_flushes: 16
flush_check_period: 30s
flush_op_timeout: 10s
# The four chunk_* / max_chunk_age values below match the user config earlier
# in this post (20m, 24h, 262144, 48h).
chunk_retain_period: 20m0s
chunk_idle_period: 24h0m0s
# Same value as chunk_target_size (262144 bytes = 256 KiB).
chunk_block_size: 262144
chunk_target_size: 262144
chunk_encoding: gzip
max_chunk_age: 48h0m0s
sync_period: 0s
sync_min_utilization: 0
max_returned_stream_errors: 10
query_store_max_look_back_period: 0s
wal:
# The write-ahead log is disabled.
# NOTE(review): presumably this is why the most recent logs are lost when the
# process is OOM-killed, as the post reports — confirm.
enabled: false
dir: wal
checkpoint_duration: 5m0s
flush_on_shutdown: false
replay_memory_ceiling: 4294967296
storage_config:
engine: chunks
aws:
dynamodb:
dynamodb_url: ""
api_limit: 2
throttle_limit: 10
metrics:
url: ""
target_queue_length: 100000
scale_up_factor: 1.3
ignore_throttle_below: 1
queue_length_query: sum(avg_over_time(cortex_ingester_flush_queue_length{job="cortex/ingester"}[2m]))
write_throttle_query: sum(rate(cortex_dynamo_throttled_total{operation="DynamoDB.BatchWriteItem"}[1m]))
by (table) > 0
write_usage_query: sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.BatchWriteItem"}[15m]))
by (table) > 0
read_usage_query: sum(rate(cortex_dynamo_consumed_capacity_total{operation="DynamoDB.QueryPages"}[1h]))
by (table) > 0
read_error_query: sum(increase(cortex_dynamo_failures_total{operation="DynamoDB.QueryPages",error="ProvisionedThroughputExceededException"}[1m]))
by (table) > 0
chunk_gang_size: 10
chunk_get_max_parallelism: 32
backoff_config:
min_period: 100ms
max_period: 50s
max_retries: 20
s3: ""
s3forcepathstyle: false
bucketnames: ""
endpoint: ""
region: ""
access_key_id: ""
secret_access_key: ""
insecure: false
sse_encryption: false
http_config:
idle_conn_timeout: 1m30s
response_header_timeout: 0s
insecure_skip_verify: false
signature_version: v4
sse:
type: ""
kms_key_id: ""
kms_encryption_context: ""
azure:
environment: AzureGlobal
container_name: cortex
account_name: ""
account_key: ""
download_buffer_size: 512000
upload_buffer_size: 256000
upload_buffer_count: 1
request_timeout: 30s
max_retries: 5
min_retry_delay: 10ms
max_retry_delay: 500ms
bigtable:
project: ""
instance: ""
grpc_client_config:
max_recv_msg_size: 104857600
max_send_msg_size: 16777216
grpc_compression: ""
rate_limit: 0
rate_limit_burst: 0
backoff_on_ratelimits: false
backoff_config:
min_period: 100ms
max_period: 10s
max_retries: 10
tls_enabled: true
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
table_cache_enabled: true
table_cache_expiration: 30m0s
gcs:
bucket_name: ""
chunk_buffer_size: 0
request_timeout: 0s
cassandra:
addresses: ""
port: 9042
keyspace: ""
consistency: QUORUM
replication_factor: 3
disable_initial_host_lookup: false
SSL: false
host_verification: true
CA_path: ""
tls_cert_path: ""
tls_key_path: ""
auth: false
username: ""
password: ""
password_file: ""
custom_authenticators: []
timeout: 2s
connect_timeout: 5s
reconnect_interval: 1s
max_retries: 0
retry_max_backoff: 10s
retry_min_backoff: 100ms
query_concurrency: 0
num_connections: 2
convict_hosts_on_failure: true
table_options: ""
boltdb:
directory: ""
filesystem:
directory: /var/local/loki/chunks
swift:
auth_version: 0
auth_url: ""
username: ""
user_domain_name: ""
user_domain_id: ""
user_id: ""
password: ""
domain_id: ""
domain_name: ""
project_id: ""
project_name: ""
project_domain_id: ""
project_domain_name: ""
region_name: ""
container_name: ""
max_retries: 3
connect_timeout: 10s
request_timeout: 5s
index_cache_validity: 5m0s
index_queries_cache_config:
enable_fifocache: false
default_validity: 0s
background:
writeback_goroutines: 10
writeback_buffer: 10000
memcached:
expiration: 0s
batch_size: 1024
parallelism: 100
memcached_client:
host: ""
service: memcached
addresses: ""
timeout: 100ms
max_idle_conns: 16
update_interval: 1m0s
consistent_hash: true
circuit_breaker_consecutive_failures: 10
circuit_breaker_timeout: 10s
circuit_breaker_interval: 10s
redis:
endpoint: ""
master_name: ""
timeout: 500ms
expiration: 0s
db: 0
pool_size: 0
password: ""
tls_enabled: false
tls_insecure_skip_verify: false
idle_timeout: 0s
max_connection_age: 0s
fifocache:
max_size_bytes: ""
max_size_items: 0
validity: 0s
size: 0
prefix: store.index-cache-read.
delete_store:
store: ""
requests_table_name: delete_requests
table_provisioning:
enable_ondemand_throughput_mode: false
provisioned_write_throughput: 1
provisioned_read_throughput: 300
write_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
read_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
tags: {}
grpc_store: {}
max_chunk_batch_size: 50
boltdb_shipper:
active_index_directory: /var/local/loki/boltdb-shipper-active
shared_store: filesystem
cache_location: /tmp/loki/boltdb-shipper-cache
cache_ttl: 24h0m0s
resync_interval: 5m0s
query_ready_num_days: 0
chunk_store_config:
chunk_cache_config:
enable_fifocache: false
default_validity: 0s
background:
writeback_goroutines: 10
writeback_buffer: 10000
memcached:
expiration: 0s
batch_size: 1024
parallelism: 100
memcached_client:
host: ""
service: memcached
addresses: ""
timeout: 100ms
max_idle_conns: 16
update_interval: 1m0s
consistent_hash: true
circuit_breaker_consecutive_failures: 10
circuit_breaker_timeout: 10s
circuit_breaker_interval: 10s
redis:
endpoint: ""
master_name: ""
timeout: 500ms
expiration: 0s
db: 0
pool_size: 0
password: ""
tls_enabled: false
tls_insecure_skip_verify: false
idle_timeout: 0s
max_connection_age: 0s
fifocache:
max_size_bytes: ""
max_size_items: 0
validity: 0s
size: 0
prefix: store.chunks-cache.
write_dedupe_cache_config:
enable_fifocache: false
default_validity: 0s
background:
writeback_goroutines: 10
writeback_buffer: 10000
memcached:
expiration: 0s
batch_size: 1024
parallelism: 100
memcached_client:
host: ""
service: memcached
addresses: ""
timeout: 100ms
max_idle_conns: 16
update_interval: 1m0s
consistent_hash: true
circuit_breaker_consecutive_failures: 10
circuit_breaker_timeout: 10s
circuit_breaker_interval: 10s
redis:
endpoint: ""
master_name: ""
timeout: 500ms
expiration: 0s
db: 0
pool_size: 0
password: ""
tls_enabled: false
tls_insecure_skip_verify: false
idle_timeout: 0s
max_connection_age: 0s
fifocache:
max_size_bytes: ""
max_size_items: 0
validity: 0s
size: 0
prefix: store.index-cache-write.
cache_lookups_older_than: 0s
max_look_back_period: 0s
schema_config:
configs:
- from: "2020-10-24"
store: boltdb-shipper
object_store: filesystem
schema: v11
index:
prefix: index_
period: 1d
tags: {}
chunks:
prefix: ""
period: 0s
tags: {}
row_shards: 16
limits_config:
ingestion_rate_strategy: local
ingestion_rate_mb: 4
ingestion_burst_size_mb: 6
max_label_name_length: 1024
max_label_value_length: 2048
max_label_names_per_series: 30
reject_old_samples: true
reject_old_samples_max_age: 168h0m0s
creation_grace_period: 10m0s
enforce_metric_name: true
max_line_size: 0
max_streams_per_user: 10000
max_global_streams_per_user: 0
max_chunks_per_query: 2000000
max_query_series: 500
max_query_lookback: 0s
max_query_length: 0s
max_query_parallelism: 14
cardinality_limit: 100000
max_streams_matchers_per_query: 1000
max_concurrent_tail_requests: 10
max_entries_limit_per_query: 5000
max_cache_freshness_per_query: 1m0s
split_queries_by_interval: 0s
ruler_evaluation_delay_duration: 0s
ruler_max_rules_per_rule_group: 0
ruler_max_rule_groups_per_tenant: 0
per_tenant_override_config: ""
per_tenant_override_period: 10s
table_manager:
throughput_updates_disabled: false
retention_deletes_enabled: true
retention_period: 1y
poll_interval: 2m0s
creation_grace_period: 10m0s
index_tables_provisioning:
enable_ondemand_throughput_mode: false
provisioned_write_throughput: 1000
provisioned_read_throughput: 300
write_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
read_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
enable_inactive_throughput_on_demand_mode: false
inactive_write_throughput: 1
inactive_read_throughput: 300
inactive_write_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
inactive_read_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
inactive_write_scale_lastn: 4
inactive_read_scale_lastn: 4
chunk_tables_provisioning:
enable_ondemand_throughput_mode: false
provisioned_write_throughput: 1000
provisioned_read_throughput: 300
write_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
read_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
enable_inactive_throughput_on_demand_mode: false
inactive_write_throughput: 1
inactive_read_throughput: 300
inactive_write_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
inactive_read_scale:
enabled: false
role_arn: ""
min_capacity: 3000
max_capacity: 6000
out_cooldown: 1800
in_cooldown: 1800
target: 80
inactive_write_scale_lastn: 4
inactive_read_scale_lastn: 4
frontend_worker:
frontend_address: ""
scheduler_address: ""
dns_lookup_duration: 10s
parallelism: 10
match_max_concurrent: false
id: ""
grpc_client_config:
max_recv_msg_size: 104857600
max_send_msg_size: 16777216
grpc_compression: ""
rate_limit: 0
rate_limit_burst: 0
backoff_on_ratelimits: false
backoff_config:
min_period: 100ms
max_period: 10s
max_retries: 10
tls_enabled: false
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
frontend:
log_queries_longer_than: 0s
max_body_size: 10485760
query_stats_enabled: false
max_outstanding_per_tenant: 100
compress_responses: false
downstream_url: ""
tail_proxy_url: ""
ruler:
external_url: ""
ruler_client:
max_recv_msg_size: 104857600
max_send_msg_size: 16777216
grpc_compression: ""
rate_limit: 0
rate_limit_burst: 0
backoff_on_ratelimits: false
backoff_config:
min_period: 100ms
max_period: 10s
max_retries: 10
tls_enabled: false
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
evaluation_interval: 1m0s
poll_interval: 1m0s
storage:
type: local
configdb:
configs_api_url: ""
client_timeout: 5s
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
azure:
environment: AzureGlobal
container_name: cortex
account_name: ""
account_key: ""
download_buffer_size: 512000
upload_buffer_size: 256000
upload_buffer_count: 1
request_timeout: 30s
max_retries: 5
min_retry_delay: 10ms
max_retry_delay: 500ms
gcs:
bucket_name: ""
chunk_buffer_size: 0
request_timeout: 0s
s3:
s3: ""
s3forcepathstyle: false
bucketnames: ""
endpoint: ""
region: ""
access_key_id: ""
secret_access_key: ""
insecure: false
sse_encryption: false
http_config:
idle_conn_timeout: 1m30s
response_header_timeout: 0s
insecure_skip_verify: false
signature_version: v4
sse:
type: ""
kms_key_id: ""
kms_encryption_context: ""
swift:
auth_version: 0
auth_url: ""
username: ""
user_domain_name: ""
user_domain_id: ""
user_id: ""
password: ""
domain_id: ""
domain_name: ""
project_id: ""
project_name: ""
project_domain_id: ""
project_domain_name: ""
region_name: ""
container_name: ""
max_retries: 3
connect_timeout: 10s
request_timeout: 5s
local:
directory: /etc/loki/rules/
rule_path: /etc/loki/rules-temp/
alertmanager_url: http://localhost:9093
enable_alertmanager_discovery: false
alertmanager_refresh_interval: 1m0s
enable_alertmanager_v2: true
notification_queue_capacity: 10000
notification_timeout: 10s
alertmanager_client:
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
basic_auth_username: ""
basic_auth_password: ""
for_outage_tolerance: 1h0m0s
for_grace_period: 10m0s
resend_delay: 1m0s
enable_sharding: false
sharding_strategy: default
search_pending_for: 5m0s
ring:
kvstore:
store: inmemory
prefix: rulers/
consul:
host: localhost:8500
acl_token: ""
http_client_timeout: 20s
consistent_reads: false
watch_rate_limit: 1
watch_burst_size: 1
etcd:
endpoints: []
dial_timeout: 10s
max_retries: 10
tls_enabled: false
tls_cert_path: ""
tls_key_path: ""
tls_ca_path: ""
tls_server_name: ""
tls_insecure_skip_verify: false
multi:
primary: ""
secondary: ""
mirror_enabled: false
mirror_timeout: 2s
heartbeat_period: 5s
heartbeat_timeout: 1m0s
instance_id: logtest0
instance_interface_names:
- eth0
- en0
instance_port: 0
instance_addr: ""
num_tokens: 128
flush_period: 1m0s
enable_api: true
query_range:
split_queries_by_interval: 0s
split_queries_by_day: false
align_queries_with_step: false
results_cache:
cache:
enable_fifocache: false
default_validity: 0s
background:
writeback_goroutines: 10
writeback_buffer: 10000
memcached:
expiration: 0s
batch_size: 1024
parallelism: 100
memcached_client:
host: ""
service: memcached
addresses: ""
timeout: 100ms
max_idle_conns: 16
update_interval: 1m0s
consistent_hash: true
circuit_breaker_consecutive_failures: 10
circuit_breaker_timeout: 10s
circuit_breaker_interval: 10s
redis:
endpoint: ""
master_name: ""
timeout: 500ms
expiration: 0s
db: 0
pool_size: 0
password: ""
tls_enabled: false
tls_insecure_skip_verify: false
idle_timeout: 0s
max_connection_age: 0s
fifocache:
max_size_bytes: ""
max_size_items: 0
validity: 0s
size: 0
prefix: frontend.
compression: ""
cache_results: false
max_retries: 5
parallelise_shardable_queries: false
runtime_config:
period: 10s
file: ""
memberlist:
node_name: ""
randomize_node_name: true
stream_timeout: 0s
retransmit_factor: 0
pull_push_interval: 0s
gossip_interval: 0s
gossip_nodes: 0
gossip_to_dead_nodes_time: 0s
dead_node_reclaim_time: 0s
join_members: []
min_join_backoff: 1s
max_join_backoff: 1m0s
max_join_retries: 10
abort_if_cluster_join_fails: true
rejoin_interval: 0s
left_ingesters_timeout: 5m0s
leave_timeout: 5s
message_history_buffer_bytes: 0
bind_addr: []
bind_port: 7946
packet_dial_timeout: 5s
packet_write_timeout: 5s
tracing:
enabled: true
compactor:
working_directory: /tmp/loki/boltdb-shipper-compactor
shared_store: filesystem
compaction_interval: 2h0m0s