Hi
I am using Loki distributed v2.8.2, deployed with Helm.
After adding the TSDB settings, everything worked fine until the queriers were restarted. However, when the queriers were restarted for some reason (e.g. a Pod getting OOMKilled), they failed to start.
For example, say I applied the settings on 06/06 to switch to TSDB starting from 2023/06/07. Queriers restarted at that point, before the switchover, came back without any issue. However, queriers restarted on 06/07, after the switch to TSDB, failed to start.
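Concretely, the switch is the TSDB-related part of the config shown in full further below: a new schema_config period with store: tsdb plus a tsdb_shipper block under storage_config, roughly in this shape (the date here follows the 06/06–06/07 example above; the actual entry in my values.yml uses 2023-05-09):

schema_config:
  configs:
    - from: 2023-06-07      # illustrative date; the real config below uses 2023-05-09
      store: tsdb
      object_store: aws
      schema: v12
      index:
        prefix: loki_tsdb_index_
        period: 24h
storage_config:
  tsdb_shipper:
    active_index_directory: /var/loki/tsdb-index
    shared_store: s3
    cache_location: /var/loki/tsdb-cache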
The error output I received from the failing queriers is below:
level=warn ts=2023-05-10T01:15:06.477889636Z caller=loki.go:286 msg="per-tenant timeout not configured, using default engine timeout (\"5m0s\"). This behavior will change in the next major to always use the default per-tenant timeout (\"5m\")."
level=info ts=2023-05-10T01:15:06.479054893Z caller=main.go:108 msg="Starting Loki" version="(version=2.8.2, branch=HEAD, revision=9f809eda7)"
level=info ts=2023-05-10T01:15:06.479582994Z caller=server.go:323 http=[::]:3100 grpc=[::]:9095 msg="server listening on addresses"
level=info ts=2023-05-10T01:15:06.48050178Z caller=memberlist_client.go:437 msg="Using memberlist cluster label and node name" cluster_label= node=multi-tenant-loki-distributed-querier-2-6edf9ae2
level=warn ts=2023-05-10T01:15:06.481237289Z caller=experimental.go:20 msg="experimental feature in use" feature="Redis cache - store.index-cache-read.redis"
level=info ts=2023-05-10T01:15:06.481455427Z caller=memberlist_client.go:543 msg="memberlist fast-join starting" nodes_found=1 to_join=4
level=warn ts=2023-05-10T01:15:06.482303259Z caller=experimental.go:20 msg="experimental feature in use" feature="Redis cache - chunksredis"
level=info ts=2023-05-10T01:15:06.483451532Z caller=table_manager.go:404 msg="loading local table loki_index_19478"
ts=2023-05-10T01:15:06.48355709Z caller=spanlogger.go:85 level=info msg="building index list cache"
level=info ts=2023-05-10T01:15:06.561697692Z caller=memberlist_client.go:563 msg="memberlist fast-join finished" joined_nodes=23 elapsed_time=80.241192ms
level=info ts=2023-05-10T01:15:06.561732372Z caller=memberlist_client.go:576 msg="joining memberlist cluster" join_members=multi-tenant-loki-distributed-memberlist
level=info ts=2023-05-10T01:15:06.641486839Z caller=memberlist_client.go:595 msg="joining memberlist cluster succeeded" reached_nodes=23 elapsed_time=79.757319ms
ts=2023-05-10T01:15:13.686923728Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.16.71:7946"
ts=2023-05-10T01:15:15.290610097Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.16.208:7946"
ts=2023-05-10T01:15:15.692413217Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.27.185:7946"
ts=2023-05-10T01:15:15.694482606Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.26.125:7946"
ts=2023-05-10T01:15:15.69611328Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.31.192:7946"
ts=2023-05-10T01:15:17.295083584Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.19.166:7946"
ts=2023-05-10T01:15:17.296178116Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.26.125:7946"
ts=2023-05-10T01:15:17.298064548Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.25.30:7946"
ts=2023-05-10T01:15:27.981918391Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.26.125:7946"
ts=2023-05-10T01:15:29.984221814Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.16.71:7946"
ts=2023-05-10T01:15:29.986048194Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.24.110:7946"
ts=2023-05-10T01:15:29.986101366Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.23.42:7946"
ts=2023-05-10T01:15:34.845853131Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.25.181:7946"
ts=2023-05-10T01:15:36.848547471Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.16.71:7946"
ts=2023-05-10T01:15:36.850664503Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.31.192:7946"
ts=2023-05-10T01:15:36.850788011Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.29.75:7946"
ts=2023-05-10T01:15:40.258021577Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.25.62:7946"
ts=2023-05-10T01:15:40.478519954Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.21.20:7946"
ts=2023-05-10T01:15:42.260781968Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.29.110:7946"
ts=2023-05-10T01:15:42.260980241Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.25.181:7946"
ts=2023-05-10T01:15:42.26162014Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.26.125:7946"
ts=2023-05-10T01:15:42.483962061Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.29.75:7946"
ts=2023-05-10T01:15:42.486228515Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.24.128:7946"
ts=2023-05-10T01:15:42.487477726Z caller=memberlist_logger.go:74 level=warn msg="Got ping for unexpected node 'multi-tenant-loki-distributed-querier-2-df6da675' from=100.116.25.30:7946"
ts=2023-05-10T01:15:46.970494584Z caller=memberlist_logger.go:74 level=info msg="Marking multi-tenant-loki-distributed-querier-2-df6da675 as failed, suspect timeout reached (2 peer confirmations)"
Here is my values.yml:
ingester:
  replicas: 10
  maxUnavailable: 4
  persistence:
    enabled: true
    size: 30Gi
    storageClass: gp2
  resources:
    requests:
      cpu: 1000m
      memory: 4000Mi
    limits:
      cpu: 1000m
      memory: 4000Mi
  affinity: |
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app.kubernetes.io/component
                  operator: In
                  values:
                    - ingester
                - key: app.kubernetes.io/instance
                  operator: In
                  values:
                    - multi-tenant
                - key: app.kubernetes.io/name
                  operator: In
                  values:
                    - loki-distributed
            topologyKey: failure-domain.beta.kubernetes.io/zone
distributor:
  replicas: 6
  maxUnavailable: 2
  resources:
    requests:
      cpu: 500m
      memory: 750Mi
    limits:
      cpu: 500m
      memory: 750Mi
  affinity: |
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app.kubernetes.io/component
                  operator: In
                  values:
                    - distributor
                - key: app.kubernetes.io/instance
                  operator: In
                  values:
                    - multi-tenant
                - key: app.kubernetes.io/name
                  operator: In
                  values:
                    - loki-distributed
            topologyKey: failure-domain.beta.kubernetes.io/zone
querier:
  replicas: 10
  maxUnavailable: 4
  persistence:
    enabled: true
    size: 30Gi
    storageClass: gp2
  resources:
    requests:
      cpu: 1500m
      memory: 2000Mi
    limits:
      cpu: 1500m
      memory: 2000Mi
  affinity: |
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app.kubernetes.io/component
                  operator: In
                  values:
                    - querier
                - key: app.kubernetes.io/instance
                  operator: In
                  values:
                    - multi-tenant
                - key: app.kubernetes.io/name
                  operator: In
                  values:
                    - loki-distributed
            topologyKey: failure-domain.beta.kubernetes.io/zone
queryFrontend:
  replicas: 2
  maxUnavailable: 1
  resources:
    requests:
      cpu: 250m
      memory: 500Mi
    limits:
      cpu: 250m
      memory: 500Mi
  affinity: |
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app.kubernetes.io/component
                  operator: In
                  values:
                    - queryFrontend
                - key: app.kubernetes.io/instance
                  operator: In
                  values:
                    - multi-tenant
                - key: app.kubernetes.io/name
                  operator: In
                  values:
                    - loki-distributed
            topologyKey: failure-domain.beta.kubernetes.io/zone
serviceAccount:
  create: false
compactor:
  enabled: true
  resources:
    requests:
      cpu: 500m
      memory: 750Mi
    limits:
      cpu: 500m
      memory: 750Mi
gateway:
  resources:
    requests:
      cpu: 100m
      memory: 100Mi
    limits:
      cpu: 100m
      memory: 100Mi
  service:
    type: NodePort
    nodePort: 32000
  image:
    registry: xxxx.dkr.ecr.ap-northeast-1.amazonaws.com
    repository: isop-ecr
    tag: nginx-1.19-alpine
loki:
  image:
    registry: xxxx.dkr.ecr.ap-northeast-1.amazonaws.com
    repository: isop-ecr
    tag: loki-2.8.2
  config: |
    common:
      compactor_address: {{ include "loki.compactorFullname" . }}:3100
    # For multi tenant mode you should set true to auth_enabled
    auth_enabled: true
    analytics:
      reporting_enabled: false
    server:
      log_level: info
      # Must be set to 3100
      http_listen_port: 3100
    distributor:
      ring:
        kvstore:
          store: memberlist
    ingester:
      # Disable chunk transfer which is not possible with statefulsets
      # and unnecessary for boltdb-shipper
      max_transfer_retries: 0
      chunk_idle_period: 1h
      chunk_target_size: 1572864
      max_chunk_age: 2h
      lifecycler:
        join_after: 0s
        ring:
          kvstore:
            store: memberlist
          replication_factor: 6
      wal:
        dir: /var/loki/wal
    memberlist:
      join_members:
        - {{ include "loki.fullname" . }}-memberlist
    limits_config:
      ingestion_rate_mb: 20
      ingestion_burst_size_mb: 40
      max_cache_freshness_per_query: 10m
      max_global_streams_per_user: 0 ## 0 means disable
    chunk_store_config:
      max_look_back_period: 2232h
      chunk_cache_config:
        redis:
          endpoint: redis-for-loki.piokv0.ng.0001.apne1.cache.amazonaws.com:6379
          expiration: 24h
    schema_config:
      configs:
        - from: 2023-01-20
          store: boltdb-shipper
          object_store: aws
          schema: v12
          index:
            prefix: loki_index_
            period: 24h
        ## Add 2023/05/08 for TSDB index
        - from: 2023-05-09
          store: tsdb
          object_store: aws
          schema: v12
          index:
            prefix: loki_tsdb_index_
            period: 24h
    storage_config:
      aws:
        s3: s3://ap-northeast-1/loki-bucket
        s3forcepathstyle: true
      boltdb_shipper:
        active_index_directory: /var/loki/index
        shared_store: s3
        cache_location: /var/loki/cache
      index_queries_cache_config:
        redis:
          endpoint: redis-for-loki.piokv0.ng.0001.apne1.cache.amazonaws.com:6379
          expiration: 24h
      ## Add 2023/05/08 for TSDB index
      tsdb_shipper:
        active_index_directory: /var/loki/tsdb-index
        shared_store: s3
        cache_location: /var/loki/tsdb-cache
    query_range:
      # make queries more cache-able by aligning them with their step intervals
      align_queries_with_step: true
      max_retries: 5
      cache_results: true
      results_cache:
        cache:
          redis:
            endpoint: redis-for-loki.piokv0.ng.0001.apne1.cache.amazonaws.com:6379
            expiration: 72h
    querier:
      ## Add 2023/05/08 for TSDB index
      max_concurrent: 16
    frontend_worker:
      frontend_address: {{ include "loki.queryFrontendFullname" . }}:9095
    frontend:
      log_queries_longer_than: 5s
      compress_responses: true
      tail_proxy_url: http://{{ include "loki.querierFullname" . }}:3100
    compactor:
      shared_store: s3