I’m facing an issue where adding more querier servers to my existing querier fleet doesn’t result in better performance. Below is the performance I’m currently getting with 20 queriers.
Data source stats:

| Stat | Value |
| --- | --- |
| Summary: bytes processed per second | 4.85 GB/s |
| Summary: lines processed per second | 5006492 |
| Summary: total bytes processed | 300 GB |
| Summary: exec time | 53.76 s |
| Ingester: total reached | 13 |
| Ingester: total chunks matched | 16 |
| Ingester: total batches | 13 |
| Ingester: total lines sent | 0 |
| Ingester: head chunk bytes | 0 B |
| Ingester: head chunk lines | 0 |
| Ingester: decompressed bytes | 0 B |
| Ingester: decompressed lines | 0 |
| Ingester: compressed bytes | 0 B |
| Ingester: total duplicates | 0 |
After seeing these stats, I added 20 more queriers to bring down the query execution time, but it didn’t result in much of a gain: exec time only went from ~53 s to ~45-50 s.
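My working theory (please correct me if I’m wrong) is that the fan-out of a single query is capped by my limits_config rather than by the number of queriers, so the extra queriers mostly sit idle. Rough math, taking a 24h query range purely as an illustrative example:

```
# Values taken from the limits_config further down; the 24h range is only an example.
#   24h / split_queries_by_interval (15m)  = 96 subqueries
#   max_query_parallelism: 16              = at most 16 subqueries scheduled at once
# => roughly 96 / 16 = 6 sequential "waves", no matter how many queriers are running.
limits_config:
  split_queries_by_interval: 15m
  max_query_parallelism: 16
```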
Below is my current configuration.

Main config:

```
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9095
  http_server_read_timeout: 5m
  http_server_write_timeout: 5m
  grpc_server_max_recv_msg_size: 16777216
  grpc_server_max_send_msg_size: 16777216

common:
  compactor_address: http://backend:3100
  storage:
    s3:
      bucketnames: loki-data
      region: ap-south-1
  replication_factor: 1
  ring:
    instance_addr: ${INSTANCE_ADDR}
    instance_port: ${INSTANCE_PORT}
    kvstore:
      store: memberlist

memberlist:
  join_members:
    - read:7946
    - 172.1.1.1:7946
    - 172.1.1.2:7946
    - 172.1.1.3:7946
    - 172.1.1.4:7946
    - 172.1.1.5:7946
  advertise_addr: ${INSTANCE_ADDR}
  advertise_port: ${GOSSIP_PORT}

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: s3
      schema: v13
      index:
        path_prefix: index/
        prefix: index_
        period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /loki/tsdb-index
    cache_location: /loki/tsdb-cache

# chunk_store_config:
#   chunk_cache_config:
#     memcached:
#       batch_size: 256
#       parallelism: 10
#       expiration: 24h
#     memcached_client:
#       host: memcached
#       service: memcached

# query_range:
#   results_cache:
#     cache:
#       memcached_client:
#         host: memcached
#         service: memcached

compactor:
  working_directory: /loki/compactor
  compaction_interval: 10m
  max_compaction_parallelism: 4
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
  delete_request_store: s3

ruler:
  alertmanager_url: http://localhost:9093

limits_config:
  allow_structured_metadata: true
  retention_period: 365d
  split_queries_by_interval: 15m
  max_query_parallelism: 16

ingester:
  wal:
    enabled: true
    dir: /loki/wal
  max_chunk_age: 2m
  lifecycler:
    ring:
      kvstore:
        store: memberlist
      replication_factor: 1
```

Querier config:

```
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9095
  grpc_server_max_recv_msg_size: 104857600
  grpc_server_max_send_msg_size: 104857600

common:
  compactor_address: http://172.*.*.*:3105
  storage:
    s3:
      bucketnames: loki-data
      region: ap-south-1
  replication_factor: 1
  ring:
    kvstore:
      store: memberlist

memberlist:
  join_members:
    - 172.*.*.*:7945

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: s3
      schema: v13
      index:
        path_prefix: index/
        prefix: index_
        period: 24h

storage_config:
  tsdb_shipper:
    active_index_directory: /loki/tsdb-index
    cache_location: /loki/tsdb-cache

query_range:

frontend_worker:
  frontend_address: 172.*.*.*:9098

querier:
  engine:
    max_look_back_period: 0s

limits_config:
  allow_structured_metadata: true
  retention_period: 168h
  split_queries_by_interval: 15m
  max_query_parallelism: 16
```
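One more thing I noticed while re-reading the querier config: I never set querier.max_concurrent, so each querier runs at its default concurrency. If I understand the docs correctly, the capacity a single query can actually use is bounded by roughly max_concurrent × the number of queriers on one side and max_query_parallelism on the other, so this knob may matter too. A sketch of where it would go (the value is only a guess on my part, not something I have tested):

```
querier:
  max_concurrent: 8          # hypothetical value; concurrent subqueries per querier
  engine:
    max_look_back_period: 0s
```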
Hi @tonyswumac,
Thanks for your reply.
- The services below are all running on a single server.
**Main Server:**

```
networks:
loki:
services:
read:
image: grafana/loki:latest
command: "-config.file=/etc/loki/loki-config-main.yaml -target=read -config.expand-env=true"
env_file:
- .env
ports:
- "3101:3100"
- "7945:7946"
- "9095:9095"
environment:
- INSTANCE_ADDR=172.*.*.*
- INSTANCE_PORT=9095
- GOSSIP_PORT=7945
volumes:
- ./loki-config-main.yaml:/etc/loki/loki-config-main.yaml
depends_on:
- minio
healthcheck:
test:
[
"CMD-SHELL",
"wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1",
]
interval: 10s
timeout: 5s
retries: 5
networks: &loki-dns
loki:
aliases:
- loki
write:
image: grafana/loki:latest
command: "-config.file=/etc/loki/loki-config-main.yaml -target=write -config.expand-env=true"
env_file:
- .env
ports:
- "3102:3100"
- "7947:7946"
- "9096:9095"
environment:
- INSTANCE_ADDR=172.*.*.*
- INSTANCE_PORT=9096
- GOSSIP_PORT=7947
volumes:
- ./loki-config-main.yaml:/etc/loki/loki-config-main.yaml
- ./.data/loki-wal:/loki/wal
healthcheck:
test:
[
"CMD-SHELL",
"wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1",
]
interval: 10s
timeout: 5s
retries: 5
depends_on:
- minio
stop_grace_period: 30s
networks:
<<: *loki-dns
alloy:
image: grafana/alloy:latest
volumes:
- ./alloy-local-config.yaml:/etc/alloy/config.alloy:ro
- /var/run/docker.sock:/var/run/docker.sock
command: run --server.http.listen-addr=0.0.0.0:12345 --storage.path=/var/lib/alloy/data /etc/alloy/config.alloy
ports:
- "12345:12345"
- "4319:4317"
- "4320:4318"
depends_on:
- gateway
stop_grace_period: 30s
networks:
- loki
minio:
image: minio/minio
entrypoint:
- sh
- -euc
- |
mkdir -p /data/loki-data && \
mkdir -p /data/loki-ruler && \
minio server /data
environment:
- MINIO_ROOT_USER=loki
- MINIO_ROOT_PASSWORD=supersecret
- MINIO_PROMETHEUS_AUTH_TYPE=public
- MINIO_UPDATE=off
ports:
- "9000:9000"
volumes:
- ./.data/minio:/data
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 15s
timeout: 20s
retries: 5
networks:
- loki
grafana:
image: grafana/grafana:latest
environment:
volumes:
- ./grafana-provisioning/datasources:/etc/grafana/provisioning/datasources
- ./grafana-provisioning/dashboards:/etc/grafana/provisioning/dashboards
- grafana-data:/var/lib/grafana
ports:
- "3000:3000"
depends_on:
- gateway
- tempo
- prometheus
networks:
- loki
backend:
image: grafana/loki:latest
volumes:
- ./loki-config-main.yaml:/etc/loki/loki-config-main.yaml
- ./.data/loki-compactor:/loki/compactor
env_file:
- .env
ports:
- "3105:3100"
- "7948:7946"
- "9097:9095"
environment:
- INSTANCE_ADDR=172.*.*.*
- INSTANCE_PORT=9097
- GOSSIP_PORT=7948
command: "-config.file=/etc/loki/loki-config-main.yaml -target=backend -legacy-read-mode=false -config.expand-env=true"
stop_grace_period: 30s
depends_on:
minio:
condition: service_healthy
networks:
- loki
deploy:
resources:
limits:
cpus: '2.0'
memory: '4g'
gateway:
image: nginx:latest
depends_on:
- read
- write
- backend
entrypoint:
- sh
- -euc
- |
cat <<EOF > /etc/nginx/nginx.conf
user nginx;
worker_processes 5;
events {
worker_connections 1000;
}
http {
resolver 127.0.0.11;
client_max_body_size 10m;
server {
listen 3100;
location = / {
return 200 'OK';
auth_basic off;
}
location = /loki/api/v1/push {
proxy_pass http://write:3100\$$request_uri;
}
location / {
proxy_pass http://query-frontend:3100;
}
}
}
EOF
/docker-entrypoint.sh nginx -g "daemon off;"
ports:
- "3100:3100"
networks:
- loki
flog:
image: mingrammer/flog
command: -f json -d 200ms -l
networks:
- loki
tempo:
image: grafana/tempo:latest
command: ["-config.file=/etc/tempo.yaml"]
volumes:
- ./tempo-config.yaml:/etc/tempo.yaml
- ./.data/tempo:/tmp/tempo
ports:
- "3200:3200"
- "4317:4317"
- "4318:4318"
networks:
- loki
prometheus:
image: prom/prometheus:latest
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- ./.data/prometheus:/prometheus
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.retention.time=7d"
- "--web.listen-address=:9091"
- "--web.enable-remote-write-receiver"
ports:
- "9091:9091"
networks:
- loki
node-exporter:
image: prom/node-exporter:latest
ports:
- "9100:9100"
networks:
- loki
pyroscope:
image: grafana/pyroscope:latest
command: ["-config.file=/etc/pyroscope/pyroscope.yaml"]
ports:
- "4040:4040"
volumes:
- ./pyroscope.yaml:/etc/pyroscope/pyroscope.yaml
- ./.data/pyroscope:/var/lib/pyroscope
networks:
- loki
# query-scheduler:
# image: grafana/loki:latest
# command: "-config.file=/etc/loki/loki-config-main.yaml -target=query-scheduler -config.expand-env=true"
# volumes:
# - ./loki-config-main.yaml:/etc/loki/loki-config-main.yaml
# ports:
# - "9098:9095"
# networks:
# - loki
query-frontend:
image: grafana/loki:latest
command: "-config.file=/etc/loki/loki-config-query.yaml -target=query-frontend -memberlist.advertise-addr=${PUBLIC_IP} -log.level=debug -config.expand-env=true"
ports:
- "3103:3100"
- "9098:9095"
networks:
- loki
env_file:
- .env
volumes:
- ./loki-config-query.yaml:/etc/loki/loki-config-query.yaml
# memcached:
# image: memcached:latest
# command: -m 1024 -I 2m
# ports:
# - "11211:11211"
# networks:
# - loki
volumes:
  grafana-data:
```
**Querier Servers:**
```
version: '3.8'
networks:
loki:
services:
querier:
image: grafana/loki:latest
entrypoint:
- sh
- -c
- |
echo "Waiting 10 seconds for query-frontend to be ready..."
sleep 10
echo "Starting querier..."
exec /usr/bin/loki -config.file=/etc/loki/loki-config-query.yaml -target=querier -config.expand-env=true -memberlist.advertise-addr=${PUBLIC_IP}
volumes:
- ./loki-config-query.yaml:/etc/loki/loki-config-query.yaml
networks:
- loki
env_file:
- .env
ports:
- "7946:7946"
- "3101:3100"
deploy:
replicas: 1
resources:
limits:
cpus: '2.0'
memory: '1.8g'
```

Thanks! I’ll try your recommended changes to split_queries_by_interval and max_query_parallelism and update here.
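For reference, this is roughly what I’m planning to test first in both Loki configs. The values are just a starting point I picked for the experiment, not something recommended anywhere:

```
limits_config:
  # hypothetical starting values for the experiment, not tuned recommendations
  split_queries_by_interval: 30m   # fewer, slightly larger splits
  max_query_parallelism: 64        # let one query fan out across more queriers
```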