Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ on:
env:
POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }}
POSTGRES_USER: ${{ secrets.POSTGRES_USER }}
PGADMIN_USER=: ${{ secrets.PGADMIN_USER }}
PGADMIN_PASSWORD: ${{ secrets.PGADMIN_PASSWORD }}
PGADMIN_USER: ${{ secrets.PGADMIN_USER }}
PGADMIN_PASSWORD: ${{ secrets.PGADMIN_PASSWORD }}
GRAFANA_USER: ${{ secrets.GRAFANA_USERNAME }}
GRAFANA_PASSWORD: ${{ secrets.GRAFANA_PASSWORD }}
NGINX_CONFIG_PATH: ./nginx.conf
CERTBOT_COMMAND: "/bin/sh -c 'certbot certonly --webroot --webroot-path /var/www/certbot/ --non-interactive -d mastapp.site -d www.mastapp.site --agree-tos --register-unsafely-without-email';'while :; do certbot renew; sleep 12h; done'"
CERTBOT_COMMAND: "/bin/sh -c 'certbot certonly --webroot --webroot-path /var/www/certbot/ --non-interactive -d mastapp.site -d www.mastapp.site --agree-tos --register-unsafely-without-email; while :; do certbot renew; sleep 12h; done'"

jobs:
remote_deployment:
Expand All @@ -26,7 +28,7 @@ jobs:
username: ${{ secrets.REMOTE_USERNAME }}
key: ${{ secrets.REMOTE_PRIV_KEY }}
port: ${{ secrets.REMOTE_PORT }}
envs: POSTGRES_PASSWORD,POSTGRES_USER,PGADMIN_USER,PGADMIN_PASSWORD,NGINX_CONFIG_PATH,CERTBOT_COMMAND
envs: POSTGRES_PASSWORD,POSTGRES_USER,PGADMIN_USER,PGADMIN_PASSWORD,NGINX_CONFIG_PATH,CERTBOT_COMMAND,GRAFANA_USER,GRAFANA_PASSWORD
script: |
cd /srv/fair-mast/
git pull
Expand Down
4 changes: 3 additions & 1 deletion dev/docker/.env.dev
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ POSTGRES_USER=root
POSTGRES_PASSWORD=root
PGADMIN_USER=admin@admin.com
PGADMIN_PASSWORD=root
GRAFANA_USER=root
GRAFANA_PASSWORD=root

#The top nginx config is for a local build (generally using self-signed certificates for development)
#The bottom nginx config path is for the fully networked project
Expand All @@ -13,4 +15,4 @@ NGINX_CONFIG_PATH=./nginx-test.conf
#Runs a test certification; the certificates are not saved and the run does not count towards the weekly limit
CERTBOT_COMMAND="/bin/sh -c 'certbot certonly --webroot --webroot-path /var/www/certbot/ --non-interactive -d mastapp.site -d www.mastapp.site --agree-tos --register-unsafely-without-email --dry-run'"
#Runs a real certification, then attempts auto-renewal every 12 hours
#CERTBOT_COMMAND: "/bin/sh -c 'certbot certonly --webroot --webroot-path /var/www/certbot/ --non-interactive -d mastapp.site -d www.mastapp.site --agree-tos --register-unsafely-without-email';'while :; do certbot renew; sleep 12h; done'"
#CERTBOT_COMMAND: "/bin/sh -c 'certbot certonly --webroot --webroot-path /var/www/certbot/ --non-interactive -d mastapp.site -d www.mastapp.site --agree-tos --register-unsafely-without-email; while :; do certbot renew; sleep 12h; done'"
58 changes: 58 additions & 0 deletions dev/docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,64 @@ services:
networks:
- dbnetwork

# Service for hardware monitoring exporter
# This container runs a node_exporter instance used to export hardware metrics
node_exporter:
image: quay.io/prometheus/node-exporter:latest
container_name: node_exporter
volumes:
- '/:/host:ro,rslave'
networks:
- dbnetwork
ports:
- 9100:9100

#Service for monitoring docker containers
cadvisor:
image: ghcr.io/google/cadvisor:v0.53.0
container_name: cadvisor
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk:/dev/disk/:ro
networks:
- dbnetwork
ports:
- 8082:8080



# Service for metric aggregator
# This container runs a prometheus instance for collating all of the exposed metrics and transferring them to grafana
prometheus:
image: prom/prometheus
container_name: prometheus
volumes:
- './prometheus.yml:/etc/prometheus/prometheus.yml'
networks:
- dbnetwork
ports:
- 9090:9090

# Service for dashboarding service grafana
# This container runs a grafana instance used to display the gathered metrics in dashboards
grafana:
image: grafana/grafana-enterprise
container_name: grafana
environment:
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD}
GF_SECURITY_ADMIN_USER: ${GRAFANA_USER}
#GF_SERVER_ROOT_URL is only needed when running behind a reverse proxy; setting it without one will break grafana
GF_SERVER_ROOT_URL: https://mastapp.site/monitoring/
networks:
- dbnetwork
ports:
- 3000:3000
volumes:
- '../grafana/provisioning/:/etc/grafana/provisioning/'

networks:
dbnetwork:
name: dbnetwork
Expand Down
6 changes: 6 additions & 0 deletions dev/docker/nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ server {
proxy_pass http://mast-api:5000/;
}

#For monitoring, forward requests to the grafana login page
location /monitoring/ {
proxy_set_header Host $http_host;
proxy_pass http://grafana:3000/;
}

}


15 changes: 15 additions & 0 deletions dev/docker/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
global:
scrape_interval: 10s
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- prometheus:9090
- job_name: node
static_configs:
- targets:
- node_exporter:9100
- job_name: cadvisor
static_configs:
- targets:
- cadvisor:8080
211 changes: 211 additions & 0 deletions dev/grafana/provisioning/alerting/alert-rules-1760362858236.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
apiVersion: 1
groups:
- orgId: 1
name: FAIRMAST
folder: FAIRMAST_CONTAINERS
interval: 1m
rules:
- uid: cf097gnvqww00e
title: API_Container_Down
condition: C
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheusdatasource
model:
editorMode: code
expr: absent(container_last_seen{name="mast-api"})
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: OK
execErrState: Error
for: 3m
isPaused: false
notification_settings:
receiver: FAIRMAST_Monitoring
- uid: cf0d41pp5gzcwa
title: PGADMIN_Container_Down
condition: C
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheusdatasource
model:
editorMode: code
expr: absent(container_last_seen{name="pgadmin"})
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: OK
execErrState: Error
for: 3m
isPaused: false
notification_settings:
receiver: FAIRMAST_Monitoring
- orgId: 1
name: FAIRMAST
folder: FAIRMAST_HARDWARE
interval: 1m
rules:
- uid: ff09314seqfpce
title: RAM usage
condition: C
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheusdatasource
model:
editorMode: code
expr: (1 - (node_memory_MemAvailable_bytes{} / node_memory_MemTotal_bytes{})) * 100
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 95
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations:
description: RAM usage for the server has exceeded 95% utilisation for over 5 minutes
summary: RAM usage for the server has exceeded 95% utilisation for over 5 minutes
isPaused: false
notification_settings:
receiver: FAIRMAST_Monitoring
- uid: af094q9eheiv4b
title: CPU_Load
condition: C
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: prometheusdatasource
model:
editorMode: code
expr: scalar(node_load1{}) * 100 / count(count(node_cpu_seconds_total{}) by (cpu))
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: C
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 95
type: gt
operator:
type: and
query:
params:
- C
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
isPaused: false
notification_settings:
receiver: FAIRMAST_Monitoring
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
apiVersion: 1
contactPoints:
- orgId: 1
name: FAIRMAST_Monitoring
receivers:
- uid: ff0d6b3o3tb0gb
type: teams
settings:
url: DummyURL
disableResolveMessage: false
Loading
Loading