We are seeing a constant issue where one read node is being favored in our simple scalable deployment. If we restart the cluster, another node will take its place. The other nodes are being used, but the load significantly favors only one read node. The write path is unaffected.
Configuration is as follows:
Deployment mode=simple scalable
Loki version=2.9.4
Read nodes=3
Write nodes=3
Both read and write paths are behind an Nginx reverse-proxy for load-balancing.
Please help us determine how to get our read path properly balanced.
Loki Configuration:
# --- Ansible variables consumed by the templated Loki configuration below ---
loki_version: "2.9.4"
loki_http_port: 8443
loki_grpc_port: 9443
loki_system_user: loki
loki_system_group: loki
loki_config_dir: /etc/loki
loki_storage_dir: /loki
loki_auth_enabled: true

# Per-tenant throughput limits (MB and MB/s) and the global query timeout (seconds).
_max_tenant_throughput_mb: 40
_max_tenant_throughput_burst_mb: 60
_max_query_timeout: 600

loki_config:
  common:
    replication_factor: 3
    ring:
      kvstore:
        store: memberlist
      # NOTE(review): 10m is very generous; unhealthy ring members remain
      # eligible for a long time — confirm this is intentional.
      heartbeat_timeout: 10m
    storage:
      s3:
        bucketnames: loki
        endpoint: object-test.ceph
        region: default
        access_key_id: OMITTED
        secret_access_key: OMITTED
        insecure: false
        s3forcepathstyle: true
        http_config:
          insecure_skip_verify: true

  server:
    log_level: info
    # Was hard-coded "8443"; use the variable so the port is defined in one place
    # (mirrors grpc_listen_port below).
    http_listen_port: "{{ loki_http_port }}"
    http_tls_config:
      cert_file: "{{ loki_config_dir }}/ssl/cert.crt"
      key_file: "{{ loki_config_dir }}/ssl/cert.key"
    # Keep the HTTP timeouts slightly above the query timeout so the server
    # does not cut long-running queries off before the querier gives up.
    http_server_read_timeout: "{{ _max_query_timeout + 10 }}s"
    http_server_write_timeout: "{{ _max_query_timeout + 10 }}s"
    grpc_listen_port: "{{ loki_grpc_port }}"
    grpc_server_max_recv_msg_size: 104857600  # 100 MiB
    grpc_server_max_send_msg_size: 104857600  # 100 MiB
    grpc_server_max_concurrent_streams: 1000

  ingester:
    chunk_idle_period: 1h
    max_chunk_age: 2h
    flush_check_period: 10s
    wal:
      # Cap WAL replay at 75% of host memory.
      replay_memory_ceiling: "{{ (ansible_memtotal_mb * 0.75) | int }}MB"

  querier:
    multi_tenant_queries_enabled: true
    # NOTE(review): with only 3 concurrent sub-queries per querier, scheduler
    # work can pool on whichever querier drains its queue fastest — consider
    # raising this if read-path load stays uneven across the 3 read nodes.
    max_concurrent: 3

  memberlist:
    abort_if_cluster_join_fails: false
    bind_port: 7946
    join_members: "{{ groups['loki'] }}"
    max_join_backoff: 1m
    max_join_retries: 10
    min_join_backoff: 1s
    rejoin_interval: 1m

  schema_config:
    configs:
      # Dates are quoted so YAML does not coerce them into timestamp objects
      # (the second entry was already quoted; the first was not).
      - from: "2020-05-15"
        store: boltdb-shipper
        object_store: s3
        schema: v11
        index:
          prefix: index_
          period: 24h
      - from: "2023-01-30"
        store: tsdb
        object_store: s3
        schema: v12
        index:
          prefix: index_tsdb_
          period: 24h

  storage_config:
    hedging:
      at: "250ms"
      max_per_second: 20
      up_to: 3
    # NOTE(review): these paths are relative to Loki's working directory;
    # loki_storage_dir (/loki) is defined above but unused — consider
    # anchoring the index directories under it.
    boltdb_shipper:
      active_index_directory: "storage/boltdb-shipper-active"
      cache_location: "storage/boltdb-shipper-cache"
      cache_ttl: 24h  # Can be increased for faster performance over longer query periods, uses more disk space
      shared_store: s3
    tsdb_shipper:
      active_index_directory: "storage/tsdb-shipper-active"
      cache_location: "storage/tsdb-shipper-cache"
      shared_store: s3

  frontend:
    log_queries_longer_than: 15s
    compress_responses: true

  query_range:
    align_queries_with_step: true
    max_retries: 5
    cache_results: true
    results_cache:
      cache:
        embedded_cache:
          enabled: true
          max_size_mb: 2048
          ttl: 1h

  query_scheduler:
    use_scheduler_ring: true
    scheduler_ring:
      kvstore:
        store: memberlist

  limits_config:
    enforce_metric_name: false
    ingestion_rate_mb: "{{ _max_tenant_throughput_mb }}"
    ingestion_burst_size_mb: "{{ _max_tenant_throughput_burst_mb }}"
    per_stream_rate_limit: "{{ _max_tenant_throughput_mb }}MB"
    per_stream_rate_limit_burst: "{{ _max_tenant_throughput_burst_mb }}MB"
    max_entries_limit_per_query: 100000
    max_global_streams_per_user: 20000
    retention_period: 2w
    query_timeout: "{{ _max_query_timeout }}s"
    max_cache_freshness_per_query: "10m"
    split_queries_by_interval: 15m
    reject_old_samples: true
    max_query_series: 10000
    max_query_parallelism: 32

  compactor:
    working_directory: "storage/compactor"  # fixed typo: was "storeage/compactor"
    shared_store: s3
    compaction_interval: 1m
    retention_enabled: true