I am currently using Grafana Version v11.5.0 and grafana-on-call v1.14.1
I am trying to set up Grafana OnCall on premise. I have a Grafana instance running on a different VM, and OnCall is running on Docker.
Below is the docker-compose.yml file used for the OnCall hobby install:
# Environment shared by every OnCall container (engine, celery,
# oncall_db_migration) through the `*oncall-environment` alias.
# `$NAME` values are interpolated by Compose from the shell / .env file.
x-environment: &oncall-environment
  BASE_URL: $DOMAIN
  SECRET_KEY: $SECRET_KEY

  # --- RabbitMQ (Celery broker) ---
  # Must be the same account the rabbitmq service is created with
  # (RABBITMQ_DEFAULT_USER / RABBITMQ_DEFAULT_PASS).
  RABBITMQ_USERNAME: "rabbitmq"
  # Read from the environment; a literal starting with `*` would be parsed
  # by YAML as an alias and fail to load.
  RABBITMQ_PASSWORD: $RABBITMQ_PASSWORD
  RABBITMQ_HOST: "rabbitmq"
  RABBITMQ_PORT: "5672"
  RABBITMQ_DEFAULT_VHOST: "/"

  # --- MySQL ---
  MYSQL_PASSWORD: $MYSQL_PASSWORD
  # Same database the mysql service creates (MYSQL_DATABASE) and that its
  # healthcheck queries.
  MYSQL_DB_NAME: oncall_hobby
  MYSQL_USER: ${MYSQL_USER:-username}
  MYSQL_HOST: ${MYSQL_HOST:-mysql}
  MYSQL_PORT: "3306"

  # --- Redis ---
  REDIS_URI: redis://redis:6379/0

  # --- Django / Celery ---
  DJANGO_SETTINGS_MODULE: settings.hobby
  CELERY_WORKER_QUEUE: "default,critical,long,slack,telegram,webhook,retry,celery"
  CELERY_WORKER_CONCURRENCY: "1"
  CELERY_WORKER_MAX_TASKS_PER_CHILD: "100"
  CELERY_WORKER_SHUTDOWN_INTERVAL: "65m"
  CELERY_WORKER_BEAT_ENABLED: "True"

  # --- Grafana ---
  # Grafana is served over TLS on this port (the Grafana logs show
  # https://grafana-domain.org:3000), so the scheme has to be https.
  # NOTE(review): with a custom/private certificate the OnCall containers
  # presumably also need the signing CA in their trust store — confirm.
  GRAFANA_API_URL: https://grafana-domain.org:3000
# Containers of the OnCall "hobby" stack. Grafana itself is not part of this
# file; it runs on a separate VM and is reached through GRAFANA_API_URL.
services:
  # OnCall HTTP API (uwsgi), published on host port 8080.
  engine:
    # Pinned to the OnCall version in use instead of the implicit `latest`.
    image: grafana/oncall:v1.14.1
    restart: always
    ports:
      - "8080:8080"
    command: sh -c "uwsgi --ini uwsgi.ini"
    environment: *oncall-environment
    depends_on:
      mysql:
        condition: service_healthy
      oncall_db_migration:
        condition: service_completed_successfully
      rabbitmq:
        condition: service_healthy
      redis:
        condition: service_started

  # Celery worker for background tasks; same image and environment as engine.
  celery:
    image: grafana/oncall:v1.14.1
    restart: always
    command: sh -c "./celery_with_exporter.sh"
    environment: *oncall-environment
    depends_on:
      mysql:
        condition: service_healthy
      oncall_db_migration:
        condition: service_completed_successfully
      rabbitmq:
        condition: service_healthy
      redis:
        condition: service_started

  # One-shot job: applies Django migrations, then exits. engine and celery
  # wait for it to complete successfully.
  oncall_db_migration:
    image: grafana/oncall:v1.14.1
    command: python manage.py migrate --noinput
    environment: *oncall-environment
    depends_on:
      mysql:
        condition: service_healthy
      rabbitmq:
        condition: service_healthy

  mysql:
    image: mysql:8.0.32
    command: >-
      --default-authentication-plugin=mysql_native_password
      --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci
    restart: always
    expose:
      - "3306"
    volumes:
      - dbdata:/var/lib/mysql
    environment:
      MYSQL_ROOT_PASSWORD: $MYSQL_PASSWORD
      MYSQL_DATABASE: oncall_hobby
    deploy:
      resources:
        limits:
          memory: 1000m
          cpus: "0.5"
    # Healthy once root can run a query against the oncall_hobby database.
    healthcheck:
      test: "mysql -uroot -p$MYSQL_PASSWORD oncall_hobby -e 'select 1'"
      timeout: 20s
      retries: 10

  redis:
    image: redis:7.0.15
    restart: always
    expose:
      - "6379"
    deploy:
      resources:
        limits:
          memory: 100m
          cpus: "0.1"

  rabbitmq:
    image: "rabbitmq:3.12.0-management"
    restart: always
    hostname: rabbitmq
    volumes:
      - rabbitmqdata:/var/lib/rabbitmq
    environment:
      RABBITMQ_DEFAULT_USER: "rabbitmq"
      RABBITMQ_DEFAULT_PASS: $RABBITMQ_PASSWORD
      RABBITMQ_DEFAULT_VHOST: "/"
    deploy:
      resources:
        limits:
          memory: 1000m
          cpus: "0.5"
    healthcheck:
      test: rabbitmq-diagnostics -q ping
      interval: 30s
      timeout: 30s
      retries: 3

  # Only started with `--profile with_grafana`: creates the `grafana`
  # database on the MySQL server and exits.
  mysql_to_create_grafana_db:
    image: mysql:8.0.32
    command: >-
      bash -c "mysql -h ${MYSQL_HOST:-mysql} -uroot -p${MYSQL_PASSWORD:?err}
      -e 'CREATE DATABASE IF NOT EXISTS grafana CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;'"
    depends_on:
      mysql:
        condition: service_healthy
    profiles:
      - with_grafana
# Named volumes with default settings; they keep MySQL and RabbitMQ data
# across container re-creation.
volumes:
  dbdata: {}
  rabbitmqdata: {}
# Inline grafana.ini fragment that turns the accessControlOnCall feature
# toggle off.
# NOTE(review): no service in the visible part of this file mounts this
# config, so it presumably has no effect on the external Grafana — confirm.
configs:
  grafana.ini:
    content: |
      [feature_toggles]
      accessControlOnCall = false
The OnCall setup seems to be working fine; the migration and everything else ran without errors.
I am using a custom certificate on the Grafana side. I am not sure whether the issue is linked to the custom certificate.
Below is the config from the grafana.ini file:
# TLS certificate and private key configured for Grafana's HTTP server.
# NOTE(review): these presumably only take effect together with
# `protocol = https` in this same section (not shown here) — confirm.
# NOTE(review): if this certificate is signed by a private CA, every client
# calling this server needs that CA in its trust store — verify on both hosts.
[server]
cert_file = /etc/grafana/custom_cert.pem
cert_key = /etc/grafana/custom_cert.key
Is the error below due to the OnCall instance failing to call the Grafana instance, or the other way around?
I have been working on this for weeks now, someone please help me identify the root cause.
Below are the logs of /var/log/grafana/grafana.log from the grafana VM
logger=plugin.grafana-oncall-app t=2025-02-XXT11:28:09.553170227 level=error msg="getting incident plugin settings" error="error making request: Get \"https://grafana-domain.org:3000/api/plugins/grafana-incident-app/settings\": tls: failed to verify certificate: x509: certificate signed by unknown authority, https://grafana-domain.org:3000/api/plugins/grafana-incident-app/settings"
logger=plugin.grafana-oncall-app t=2025-02-XXT11:28:09.562176745 level=error msg="getting labels plugin settings" error="error making request: Get \"https://grafana-domain.org:3000/api/plugins/grafana-labels-app/settings\": tls: failed to verify certificate: x509: certificate signed by unknown authority, https://grafana-domain.org:3000/api/plugins/grafana-labels-app/settings"
logger=plugin.grafana-oncall-app t=2025-02-XXT11:28:10.06298683 level=error msg="Error request to oncall" error="Get \"http://grafana-oncall-url:8080/api/internal/v1/health/\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"
logger=plugin.grafana-oncall-app t=2025-02-XXT11:28:10.064312739 level=error msg="Error checking OnCall API health" error="Get \"http://grafana-oncall-url:8080/api/internal/v1/health/\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"
logger=plugin.grafana-oncall-app t=2025-02-XXT11:28:10.071094503 level=error msg="Error validating oncall plugin settings" error="error making request: Get \"https://grafana-domain.org:3000/api/org\": tls: failed to verify certificate: x509: certificate signed by unknown authority"
logger=context userId=1 orgId=1 uname=admin t=2025-02-XXT11:28:10.07220013 level=error msg="Request Completed" method=GET path=/api/plugins/grafana-oncall-app/resources/plugin/status status=500 remote_addr=130.207.157.55 time_ms=535 duration=535.967447ms size=153 referer=https://grafana-domain.org:3000/a/grafana-oncall-app handler=/api/plugins/:pluginId/resources/* status_source=downstream