Alert stays in Pending forever even with a short pending period

Hello,

I am trying to create alert rules to detect malfunctions in my routers. To do this, I collect various parameters from all my routers, consolidate them into one large table, and add a column named “STATE” that encodes each router’s status (0 for OK, 1 for WARNING, 2 for CRITICAL).

However, when I try to visualize the query results in the alert rules menu, the output does not match what I see on a dashboard: only the first column of the table is detected.

To work around this, I created a “description” column that concatenates all the information into a single string, plus an “error” column temporarily set to 1 to test whether alert firing works correctly.

My alert condition is set to trigger when this “error” column is greater than 0, so it should quickly fire an alert for every router.
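For reference, the end of the full query below reduces each row to exactly these two columns; here is a trimmed sketch of that final map():

// Workaround: one constant numeric column for the alert condition, one string column for context
|> map(fn: (r) => ({
    description: "(ID : " + string(v: r.id) + " | STATE : " + string(v: r.STATE) + ")",
    error: 1
}))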

And indeed, when I preview the rule in the editor, I see “Firing”, but once I leave the editor, the alert remains stuck in “Pending” well beyond the pending period.

Has anyone else encountered this issue or does anyone have suggestions on how to resolve this?

Thank you!


The query:

import "strings"
import "join"
 
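// Max total_cpu_five_minute over the last 30 minutes, one row per router, as CPU_max (keyed by id).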
routeur_CPU_max = from(bucket: "Telemetry")
  |> range(start: -30m)
  |> filter(fn: (r) => r["_measurement"] == "CPU")
  |> filter(fn: (r) => r["_field"] == "total_cpu_five_minute")
  |> group(columns: ["_measurement", "source", "node_name"])
  |> keep(columns: ["_value", "_measurement", "node_name", "source"])
  |> group(columns: ["_measurement", "source"])
  |> max(column: "_value")
  |> pivot(rowKey:["source"], columnKey: ["_measurement"], valueColumn: "_value")
  |> rename(columns: {"CPU": "CPU_max","source":"id"})
  |> group()
 
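// Latest ram_memory reading per router/node, as RAM_max.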
routeur_RAM_max = from(bucket: "Telemetry")
  |> range(start: -30m)
  |> filter(fn: (r) => r["_measurement"] == "Cisco-IOS-XR-nto-misc-oper:memory-summary/nodes/node/summary")
  |> filter(fn: (r) => r["_field"] == "ram_memory")
  |> keep(columns: ["_time","_value", "source", "node_name"])
  |> group(columns: ["source", "node_name"])
  |> last()
  |> group()
  |> drop(columns: ["_time"])
  |> rename(columns: {"source": "id","_value": "RAM_max"})
 
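// Minimum free_physical_memory over the last 10 minutes per router/node, as RAM_min.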
routeur_RAM_min = from(bucket: "Telemetry")
  |> range(start: -10m)
  |> filter(fn: (r) => r["_measurement"] == "Cisco-IOS-XR-nto-misc-oper:memory-summary/nodes/node/summary")
  |> filter(fn: (r) => r["_field"] == "free_physical_memory")
  |> group(columns: ["_measurement", "source", "node_name"])
  |> keep(columns: ["_value", "_measurement", "node_name", "source"])
  |> group(columns: ["_measurement", "source"])
  |> min(column: "_value")
  |> pivot(rowKey:["source","node_name"], columnKey: ["_measurement"], valueColumn: "_value")
  |> rename(columns: {"Cisco-IOS-XR-nto-misc-oper:memory-summary/nodes/node/summary": "RAM_min", "source": "id"})
  |> group()
 
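// Join RAM_max and RAM_min and compute RAM_min_percent, the percentage of RAM in use (based on the minimum free memory seen).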
routeur_RAM_percent = join.tables(
    left: routeur_RAM_max,
    right: routeur_RAM_min,
    on: (l, r) => l.id == r.id and l.node_name == r.node_name,
    as: (l, r) => ({r with RAM_max:l.RAM_max}),
    method: "full"
)
|> map(fn: (r) => ({r with RAM_max: float(v: r.RAM_max)}))
|> map(fn: (r) => ({r with RAM_min: float(v: r.RAM_min)}))
|> map(fn: (r) => ({r with RAM_min_percent: 100.0*((r.RAM_max - r.RAM_min)/r.RAM_max)}))
|> filter(fn: (r) => r.id != "")
 
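// Highest reading among the CPU-related temperature sensors per router, as CPU_max_temp.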
CPU_max_temp = from(bucket: "Telemetry")
  |> range(start: -10m)
  |> filter(fn: (r) => r["_measurement"] == "Cisco-IOS-XR-sysadmin-fretta-envmon-ui:environment/oper/temperatures/location/sensor_attributes")
  |> filter(fn: (r) => r["_field"] == "value")
  |> filter(fn: (r) => r["sensor"] =~ /CPU$/ or r["sensor"] == "0/0-CPU Inlet Local" or r["sensor"] == "0/0/1")
  |> filter(fn: (r) => not strings.containsStr(v: r["sensor"], substr: "V"))  
  |> group(columns:["source","sensor"])
  |> last()
  |> group(columns:["source"])
  |> max(column: "_value")
  |> group()
  |> keep(columns:["source","_value"])
  |> rename(columns: {"source":"id"})
  |> sort(columns: ["id"], desc: true)
  |> rename(columns: {"_value":"CPU_max_temp"})
 
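// Latest disk0: and harddisk: usage per router, "%" suffix stripped and pivoted into disk0/harddisk columns.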
disk_data = from(bucket: "Telemetry")
  |> range(start: -10m)
  |> filter(fn: (r) => r["_measurement"] == "Cisco-IOS-XR-mediasvr-linux-oper:media-svr/nodes/node/partition")
  |> filter(fn: (r) => r["_field"] == "percent")
  |> filter(fn: (r) => r["name"] == "disk0:" or r["name"] == "harddisk:")
  |> group(columns:["source","name"])
  |> last()
  |> group()
  |> keep(columns:["source","name","_value"])
  |> map(fn: (r) => ({
      r with
      _value: int(v: strings.trimSuffix(v: r._value, suffix: "%"))
  }))
  |> pivot(rowKey:["source"], columnKey: ["name"], valueColumn: "_value")
  |> rename(columns: {"source":"id","disk0:":"disk0","harddisk:":"harddisk"})
 
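// Power-supply status per router: PW_State is "OK" only if every PEM reports OK, otherwise "FAILED".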
PW_State_data = from(bucket: "Telemetry")
  |> range(start: -10m)
  |> filter(fn: (r) => r["_measurement"] == "Cisco-IOS-XR-sysadmin-fretta-envmon-ui:environment/oper/power/location/pem_attributes")
  |> filter(fn: (r) => r["_field"] == "status/value")
  |> group(columns: ["location", "source"])
  |> last()
  |> group(columns: ["source"])
  |> keep(columns: ["_value", "location", "source"])
  |> reduce(
      fn: (r, accumulator) => ({
          all_ok: accumulator.all_ok and (r._value == "OK"),
          any_failed: accumulator.any_failed or (r._value != "OK"),
          source: r.source
      }),
      identity: {all_ok: true, any_failed: false, source: ""}
  )
  |> map(fn: (r) => ({
      _value: if r.all_ok then "OK" else "FAILED",
      source: r.source
  }))
  |> group()
  |> rename(columns: {"source":"id","_value":"PW_State"})
 
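// Percentage of (non-admin-down) GigabitEthernet interfaces that are up per router: both line_state and state must be "im-state-up" and the last sample newer than 90 s.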
interface_up_percent = from(bucket: "Telemetry")
  |> range(start: -10m)
  |> filter(fn: (r) => r["_measurement"] == "Cisco-IOS-XR-pfi-im-cmd-oper:interfaces/interface-briefs/interface-brief")
  |> filter(fn: (r) => r["_field"] == "line_state" or r["_field"] == "state")
  |> filter(fn: (r) => r["_value"] != "im-state-admin-down")
  |> filter(fn: (r) => strings.containsStr(v: r["interface_name"], substr: "Gig"))
  |> group(columns:["source","interface_name","_field"])
  |> last()
  |> group(columns:["source","interface_name"])
  |> pivot(rowKey: ["_time"], columnKey: ["_field"], valueColumn: "_value")
  |> map(fn: (r) => ({
      r with elapsed_time_edge: float(v: uint(v: now()) - uint(v: r._time)) / 1000000000.0
    }))
  |> map(fn: (r) => ({
      r with
      _field: "interface_state",
      _value: if r.line_state == "im-state-up" and r.state == "im-state-up" and r.elapsed_time_edge < 90 then "up" else "down"
    })
  )
  |> rename(columns: {_value: "interface_state"})
  |> drop(columns: ["line_state", "state", "_field","_time"])
  |> group(columns:["source"])
  |> reduce(
      identity: {up_count: 0, total_count: 0},
      fn: (r, accumulator) => ({
        up_count: accumulator.up_count + (if r.interface_state == "up" then 1 else 0),
        total_count: accumulator.total_count + 1
      })
    )
  |> map(fn: (r) => ({
      source: r.source,
      interface_up_percent: (float(v: r.up_count) / float(v: r.total_count)) * 100.0
    }))
  |> group()
  |> rename(columns: {"source":"id"})
 
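// Age in seconds of the most recent CPU/RAM sample per router, as heart_beat.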
routeur_time = from(bucket: "Telemetry")
  |> range(start: -10m)
  |> filter(fn: (r) => r["_measurement"] == "Cisco-IOS-XR-nto-misc-oper:memory-summary/nodes/node/summary" or r["_measurement"] == "CPU")
  |> filter(fn: (r) => r["_field"] == "free_physical_memory" or r["_field"] == "total_cpu_five_minute")
  |> group(columns: ["source", "_measurement", "node_name"])
  |> last()
  |> group(columns: ["source"])
  |> keep(columns: ["source","_time"])
  |> map(fn: (r) => ({
      r with
      heart_beat: float(v: uint(v: now()) - uint(v: r._time)) / 1000000000.0
    }))
  |> max(column: "heart_beat")
  |> group()
  |> drop(columns:["_time"])
  |> rename(columns: {"source":"id"})
 
 
 
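// Full outer joins merging all the metrics above into a single row per router (joined on id).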
agg2 = join.tables(
    left: routeur_RAM_percent,
    right: routeur_CPU_max,
    on: (l, r) => l.id == r.id,
    as: (l, r) => ({r with RAM_min_percent:l.RAM_min_percent}),
    method: "full"
)
 
agg3 = join.tables(
    left: CPU_max_temp,
    right: agg2,
    on: (l, r) => l.id == r.id,
    as: (l, r) => ({r with CPU_max_temp:l.CPU_max_temp}),
    method: "full"
)
 
agg4 = join.tables(
    left: disk_data,
    right: agg3,
    on: (l, r) => l.id == r.id,
    as: (l, r) => ({r with disk0:l.disk0,harddisk:l.harddisk}),
    method: "full"
)
 
agg5 = join.tables(
    left: PW_State_data,
    right: agg4,
    on: (l, r) => l.id == r.id,
    as: (l, r) => ({r with PW_State:l.PW_State}),
    method: "full"
)
 
agg6 = join.tables(
    left: interface_up_percent,
    right: agg5,
    on: (l, r) => l.id == r.id,
    as: (l, r) => ({r with interface_up_percent:l.interface_up_percent}),
    method: "full"
)
 
agg7 = join.tables(
    left: routeur_time,
    right: agg6,
    on: (l, r) => l.id == r.id,
    as: (l, r) => ({r with heart_beat:l.heart_beat}),
    method: "full"
)
 
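// Derive STATE (0 = OK, 1 = WARNING, 2 = CRITICAL) from the thresholds below, then collapse each row into a description string plus a constant error column for the alert.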
agg7
|> map(fn: (r) => ({
    r with
    STATE: -1,
}))
|> map(fn: (r) => ({
    r with STATE: if exists r.heart_beat or exists r.CPU_max or exists r.CPU_max_temp or exists r.RAM_min_percent or exists r.disk0 or exists r.harddisk or exists r.PW_State or exists r.interface_up_percent then 0 else r.STATE
}))
|> map(fn: (r) => ({
    r with STATE: if (r.heart_beat > 90) or (r.CPU_max > 35) or (r.CPU_max_temp > 50) or (r.RAM_min_percent > 50) or (r.disk0 > 50) or (r.harddisk > 50) or (r.PW_State != "OK") then 1 else r.STATE
}))
|> map(fn: (r) => ({
    r with STATE: if (r.heart_beat > 300) or (r.CPU_max > 70) or (r.RAM_min_percent > 75) or (r.disk0 > 80) or (r.harddisk > 80) or (r.interface_up_percent < 100) then 2 else r.STATE
}))
|> sort(columns: ["id"], desc: true)
|> map(fn: (r) => ({
    description: "(ID : " + string(v: r.id) +
                 " | STATE : " + string(v: r.STATE) +
                 " | heart_beat : " + string(v: r.heart_beat) +
                 " | CPU_max : " + string(v: r.CPU_max) +
                 " | CPU_max_temp : " + string(v: r.CPU_max_temp) +
                 " | RAM_min_percent : " + string(v: r.RAM_min_percent) +
                 " | disk0 : " + string(v: r.disk0) +
                 " | harddisk : " + string(v: r.harddisk) +
                 " | interface_up_percent : " + string(v: r.interface_up_percent) +
                 " | PW_State : " + string(v: r.PW_State),
}))
|> map(fn: (r) => ({
    description: r.description,
    error: 1
}))
|> filter(fn: (r) => r["description"] != "")
|> yield(name: "nodes")