Loki HA Cluster with NFS storage

Hi,

I have a reverse proxy (Nginx load balancer) → 2 nodes: the first node for WRITE, the second for READ, and shared NFS storage.
Promtail (the client) connects to Nginx, and Nginx forwards writes to the first node.

I have also tried configuring the load balancer with round-robin across the 2 nodes, each running in read/write mode. The errors were the same.

The Grafana Loki dashboard shows data, but in the Loki server logs we see 3 types of errors:

  1. msg="failed to flush" err="failed to flush chunks: store put chunk: timeout, num_chunks: 1"
  2. msg="error checkpointing series" err="write /loki/wal/00000020: stale NFS file handle"
  3. msg="error syncing local boltdb files with storage" err="failed to sync index set for table index_19822: timeout"

Node for WRITE:
loki[354091]: level=error ts=2024-04-09T13:08:12.689218272Z caller=checkpoint.go:613 msg="error checkpointing series" err="write /loki/wal/00000020: stale NFS file handle"

level=error ts=2024-04-09T12:59:52.980518058Z caller=flush.go:143 org_id=fake msg="failed to flush" err="failed to flush chunks: store put chunk: timeout, num_chunks: 1, labels: {filename="/var/log/zabbix/zabbix_server.log", host="zabbix-01.x.x", job="zabbix_server"}"

Node for READ:
level=error ts=2024-04-09T13:30:01.905551292Z caller=table_manager.go:133 index-store=boltdb-shipper-2020-10-24 msg="error syncing local boltdb files with storage" err="failed to sync index set for table index_19822: timeout"

level=error ts=2024-04-09T12:59:52.980518058Z caller=flush.go:143 org_id=fake msg="failed to flush" err="failed to flush chunks: store put chunk: timeout, num_chunks: 1, labels: {filename="/var/log/zabbix/zabbix_server.log", host="zabbix-01.x.x", job="zabbix_server"}"

Loki config:

NFS

auth_enabled: false

server:
  http_listen_port: 3100
  http_server_read_timeout: 60s  # allow longer time span queries
  http_server_write_timeout: 60s  # allow longer time span queries
  grpc_server_max_recv_msg_size: 33554432  # 32MiB (int bytes), default 4MB
  grpc_server_max_send_msg_size: 33554432  # 32MiB (int bytes), default 4MB
  log_level: info

querier:
  max_concurrent: 16  # recommended: about twice the number of CPUs; the default is 10
common:
  path_prefix: /loki
  replication_factor: 1
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules

memberlist:
  join_members:
    - grafana-loki-01.x.x:7946
    - grafana-loki-02.x.x:7946

ingester:
  lifecycler:
    join_after: 10s
    observe_period: 5s
    ring:
      replication_factor: 3
      kvstore:
        store: memberlist
    final_sleep: 0s
  chunk_idle_period: 1m
  wal:
    enabled: true
    dir: /loki/wal
  max_chunk_age: 1m
  chunk_retain_period: 30s
  chunk_encoding: snappy
  chunk_target_size: 1.572864e+06
  chunk_block_size: 262144
  flush_op_timeout: 10s

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    shared_store: filesystem
    active_index_directory: /loki/index
    cache_location: /loki/boltdb-cache
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h
  ingestion_rate_mb: 20
  ingestion_burst_size_mb: 30
  per_stream_rate_limit: 5MB  # <string|int> | default = "3MB"
  per_stream_rate_limit_burst: 15MB  # <string|int> | default = "15MB"

query_range:
  cache_results: true
  results_cache:
    cache:
      enable_fifocache: true
      fifocache:
        max_size_items: 1024
        validity: 24h

ruler:
  alertmanager_url: http://localhost:9093

ingester_client:
  grpc_client_config:
    # The maximum size in bytes the client can send.
    # CLI flag: -.grpc-max-send-msg-size
    max_send_msg_size: 33554432  # 32MiB, default = 16777216
    max_recv_msg_size: 33554432

query_scheduler:
  max_outstanding_requests_per_tenant: 2048
  grpc_client_config:
    # The maximum size in bytes the client can send.
    # CLI flag: -.grpc-max-send-msg-size
    max_send_msg_size: 33554432  # 32MiB, default = 16777216
    max_recv_msg_size: 33554432