There are no error logs in the write component (only warnings), but there are in the other components.
We lost 38,666 logs around the time we scaled the write pods down from 6 to 4.
(One thing to note, though: if the problem were caused by reducing the write pods, we shouldn't have seen the same problem before the scale-down; similarly, we see inconsistent log line counts at different times on different days.)
From what I can see, it's hard to tell what's going on beyond the number of write and read pods and the error logs being generated.
Could you look at the logs below to see what the problem is?
I’ve removed as many duplicates as possible and made sure to hide as much internal information as possible.
canary log
“error running metric query test: expected only a single series result in the metric test query vector, instead received 2”,
“Loki returned an error code: 500, waiting 1.394788912s before next query.”,
“error querying loki: error response from server: empty ring”,
“error reading websocket, will retry in 10 seconds: websocket: close 1011 (internal server error): all ingesters closed the connection”,
“error reading websocket, will retry in 10 seconds: websocket: close 1011 (internal server error): too many unhealthy instances in the ring”,
“error reading websocket, will retry in 10 seconds: websocket: close 1006 (abnormal closure): unexpected EOF”
read log
“level=error caller=http.go:316 org_id=fake msg="Error from client" err="websocket: close 1006 (abnormal closure): unexpected EOF"”,
“level=error caller=scheduler_processor.go:158 org_id=fake msg="error notifying scheduler about finished query" err=EOF addr=xx.xx.169.172:9095”,
“level=error caller=scheduler_processor.go:106 msg="error processing requests from scheduler" err="rpc error: code = Canceled desc = context canceled" addr=xx.xx.169.172:9095”,
“level=error caller=retry.go:73 org_id=fake msg="error processing request" try=0 query="{container=\"loki\"} != \"\" !~ \"info\"" err="context canceled"”,
“level=error caller=tail.go:224 org_id=fake msg="Error receiving response from grpc tail client" err=EOF”,
“level=error caller=tail.go:224 org_id=fake msg="Error receiving response from grpc tail client" err="rpc error: code = Unknown desc = Ingester is stopping"”,
“level=error caller=tail.go:90 msg="Error reconnecting to disconnected ingesters" err="failed to connect with one or more ingester(s) during tailing: context canceled"”,
“level=warn caller=scheduler_processor.go:98 msg="error contacting scheduler" err="rpc error: code = Canceled desc = context canceled" addr=xx.xx.169.152:9095”,
“level=warn caller=pool.go:193 msg="removing frontend failing healthcheck" addr=xx.xx.128.217:9095 reason="rpc error: code = DeadlineExceeded desc = context deadline exceeded"”,
“caller=spanlogger.go:85 middleware=QueryShard.astMapperware org_id=fake org_id=fake level=warn msg="failed mapping AST" err="rpc error: code = Code(500) desc = empty ring\n" query="{stream=\"stdout\",pod=\"loki-canary-k249m\"}"”,
“level=error caller=retry.go:73 org_id=fake msg="error processing request" try=4 query="{stream=\"stdout\", pod=\"loki-canary-k249m\"}" err="rpc error: code = Code(500) desc = empty ring\n"”,
“level=error caller=http.go:347 org_id=fake msg="Error from iterator" err="all ingesters closed the connection"”,
“level=error caller=tail.go:140 msg="Error reconnecting to ingesters" err="failed to connect with one or more ingester(s) during tailing: empty ring"”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.136.82:9095 err="rpc error: code = DeadlineExceeded desc = context deadline exceeded"”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.136.82:9095 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.136.82:9095: connect: no route to host\""”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.136.82:9095: connect: no route to host\"" addr=xx.xx.136.82:9095”,
“level=error caller=frontend_scheduler_worker.go:248 msg="error sending requests to scheduler" err="scheduler is shutting down" addr=xx.xx.136.82:9095”,
“level=error caller=scheduler_processor.go:106 msg="error processing requests from scheduler" err="rpc error: code = Unknown desc = queue is stopped" addr=xx.xx.136.82:9095”,
“level=warn caller=pool.go:193 msg="removing frontend failing healthcheck" addr=xx.xx.138.78:9095 reason="rpc error: code = DeadlineExceeded desc = context deadline exceeded"”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.173.74:9095 err="rpc error: code = DeadlineExceeded desc = context deadline exceeded"”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.173.74:9095: connect: no route to host\"" addr=xx.xx.173.74:9095”,
“level=warn caller=scheduler_processor.go:98 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.173.74:9095: connect: no route to host\"" addr=xx.xx.173.74:9095”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.173.74:9095 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.173.74:9095: connect: no route to host\""”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.173.74:9095 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.173.74:9095: connect: connection refused\""”,
“level=error caller=frontend_scheduler_worker.go:248 msg="error sending requests to scheduler" err="scheduler is shutting down" addr=xx.xx.173.74:9095”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.173.74:9095: connect: connection refused\"" addr=xx.xx.173.74:9095”,
“level=warn caller=pool.go:193 msg="removing frontend failing healthcheck" addr=xx.xx.167.79:9095 reason="rpc error: code = DeadlineExceeded desc = context deadline exceeded"”,
“level=error caller=scheduler_processor.go:106 msg="error processing requests from scheduler" err="rpc error: code = Unknown desc = queue is stopped" addr=xx.xx.173.74:9095”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.139.105:9095: connect: no route to host\"" addr=xx.xx.139.105:9095”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.170.13:9095: connect: connection refused\"" addr=xx.xx.170.13:9095”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.143.63:9095 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.143.63:9095: connect: connection refused\""”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.170.13:9095 err="rpc error: code = DeadlineExceeded desc = context deadline exceeded"”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.139.105:9095 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.139.105:9095: connect: connection refused\""”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.143.63:9095 err="rpc error: code = DeadlineExceeded desc = context deadline exceeded"”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.170.13:9095 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.170.13:9095: connect: no route to host\""”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.170.13:9095: connect: no route to host\"" addr=xx.xx.170.13:9095”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Canceled desc = context canceled" addr=xx.xx.170.13:9095”,
“level=warn caller=scheduler_processor.go:78 msg="failed to notify querier shutdown to query-scheduler" address=xx.xx.143.63:9095 err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.143.63:9095: connect: no route to host\""”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.143.63:9095: connect: no route to host\"" addr=xx.xx.143.63:9095”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.143.63:9095: connect: connection refused\"" addr=xx.xx.143.63:9095”,
“level=error caller=frontend_scheduler_worker.go:237 msg="error contacting scheduler" err="rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp xx.xx.139.105:9095: connect: connection refused\"" addr=xx.xx.139.105:9095”,
“level=error caller=frontend_scheduler_worker.go:248 msg="error sending requests to scheduler" err="scheduler is shutting down" addr=xx.xx.143.63:9095”,
“level=error caller=frontend_scheduler_worker.go:248 msg="error sending requests to scheduler" err="scheduler is shutting down" addr=xx.xx.170.13:9095”,
“level=error caller=retry.go:73 org_id=fake msg="error processing request" try=0 query="{stream=\"stdout\", pod=\"loki-canary-lv2j8\"}" err="rpc error: code = Code(500) desc = failed to enqueue request"”,
“level=error caller=frontend_scheduler_worker.go:248 msg="error sending requests to scheduler" err="scheduler is shutting down" addr=xx.xx.139.105:9095”
backend log
Normally I see a lot of logs saying that the index schema cannot be found, but I've only included one here because the rest are redundant…
“level=error caller=compactor.go:507 msg="skipping compaction since we can't find schema for table" table=loki_index_19556”
write
Here I see the error message "POST /loki/api/v1/push (500)". It's important to note that my application log agent is fluentbit, yet none of these logs show fluentbit as the user agent…
“level=warn caller=logging.go:86 traceID=0e69042c3c34cb77 orgID=fake msg="POST /loki/api/v1/push (500) 1.789064ms Response: \"rpc error: code = Unknown desc = Ingester is shutting down\\n\" ws: false; Connection: close; Content-Length: 6480; Content-Type: application/x-protobuf; User-Agent: promtail/2.8.2; "”,
“level=warn caller=logging.go:86 traceID=007a2d82f74c8f7d orgID=fake msg="POST /loki/api/v1/push (500) 1.786022ms Response: \"rpc error: code = Unknown desc = Ingester is shutting down\\n\" ws: false; Connection: close; Content-Length: 5971; Content-Type: application/x-protobuf; User-Agent: GrafanaAgent/; "”,
“level=warn caller=logging.go:86 traceID=3ff892d7dd53efb7 orgID=fake msg="POST /loki/api/v1/push (500) 1.860704ms Response: \"rpc error: code = Unknown desc = Ingester is shutting down\\n\" ws: false; Connection: close; Content-Length: 4379; Content-Type: application/x-protobuf; User-Agent: promtail/2.8.2; "”,
“level=warn caller=logging.go:86 traceID=3320abf78a9ce2a3 orgID=fake msg="POST /loki/api/v1/push (500) 1.698046ms Response: \"rpc error: code = Unknown desc = Ingester is shutting down\\n\" ws: false; Connection: close; Content-Length: 4300; Content-Type: application/x-protobuf; User-Agent: GrafanaAgent/; "”,
gateway log
“[error] 9#9: *57 connect() failed (111: Connection refused) while connecting to upstream, client: xx.xx.141.65, server: , request: "POST /loki/api/v1/push HTTP/1.1", upstream: "http://xx.xx.238.68:3100/loki/api/v1/push", host: "loki-gateway"”,
“[error] 10#10: *8314 connect() failed (111: Connection refused) while connecting to upstream, client: xx.xx.129.12, server: , request: "POST /loki/api/v1/push HTTP/1.1", upstream: "http://xx.xx.238.68:3100/loki/api/v1/push", host: "loki.domain.com"”,
“[error] 13#13: *8291 connect() failed (111: Connection refused) while connecting to upstream, client: xx.xx.164.246, server: , request: "POST /loki/api/v1/push HTTP/1.1", upstream: "http://xx.xx.238.68:3100/loki/api/v1/push", host: "loki-gateway.loki.svc.cluster.local"”
client
“level=warn caller=client.go:419 component=client host=loki-gateway msg="error sending batch, will retry" status=502 tenant= error="server returned HTTP status 502 Bad Gateway (502): "”,
“level=warn caller=client.go:419 component=client host=loki-gateway msg="error sending batch, will retry" status=-1 tenant= error="Post \"http://loki-gateway/loki/api/v1/push\\\”: context deadline exceeded"“,
“level=warn caller=client.go:419 component=client host=loki-gateway msg="error sending batch, will retry" status=-1 tenant= error="Post \"http://loki-gateway/loki/api/v1/push\\\”: dial tcp xx.xx.27.64:80: connect: connection refused"”,
“level=warn caller=client.go:419 component=client host=loki-gateway msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): rpc error: code = Unknown desc = Ingester is shutting down"”
logs
“caller=client.go:419 level=warn component=logs logs_config=loki/loki component=client host=loki-gateway.loki.svc.cluster.local msg="error sending batch, will retry" status=502 tenant= error="server returned HTTP status 502 Bad Gateway (502): "”,
“caller=client.go:419 level=warn component=logs logs_config=loki/loki component=client host=loki-gateway.loki.svc.cluster.local msg="error sending batch, will retry" status=-1 tenant= error="Post \"http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push\\\”: context deadline exceeded"",
“caller=client.go:419 level=warn component=logs logs_config=loki/loki component=client host=loki-gateway.loki.svc.cluster.local msg="error sending batch, will retry" status=500 tenant= error="server returned HTTP status 500 Internal Server Error (500): rpc error: code = Unknown desc = Ingester is shutting down"”,
“caller=client.go:419 level=warn component=logs logs_config=loki/loki component=client host=loki-gateway.loki.svc.cluster.local msg="error sending batch, will retry" status=-1 tenant= error="Post \"http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push\\\”: dial tcp xx.xx.27.64:80: connect: connection refused"",