diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8a63237
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.idea
+.vscode
+.cache
+.*vault-pass
+.dccache
+.DS_Store
+
+files/sample/*
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..72d019f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+.PHONY: render
+render:
+	gomplate -f files/etc/prometheus-tailscale-discovery/prometheus.yml.tpl -d tailscale=files/sample/tailscale-status.json > files/sample/prometheus.rendered.yml
+
+.PHONY: repl
+repl:
+	# If you're focusing on a specific part of the file, grep out the header
+	watch 'gomplate -f files/etc/prometheus-tailscale-discovery/prometheus.yml.tpl -d tailscale=files/sample/tailscale-status.json'
+
+.PHONY: lint
+lint: render
+	promtool check config files/sample/prometheus.rendered.yml
+
+/usr/local/bin/promtool:
+	VERSION=$$(curl -Ls https://api.github.com/repos/prometheus/prometheus/releases/latest | jq -r ".tag_name" | cut -c2-); \
+	wget -qO- "https://github.com/prometheus/prometheus/releases/download/v$${VERSION}/prometheus-$${VERSION}.linux-amd64.tar.gz" \
+		| tar xvzf - "prometheus-$${VERSION}.linux-amd64/promtool" --strip-components=1
+	sudo mv promtool /usr/local/bin/promtool
diff --git a/README.md b/README.md
index 4bedbe6..bdcbbbd 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,43 @@
 # ansible-role-prometheus-tailscale-discovery
-An ansible role that uses tailscale to generate an inventory of hosts to then render a prometheus config.
+
+> [!WARNING]
+> This was copypasta'd from an internal repo and may not work stand-alone. We'll work to make this the version we use too, so that we know it works!
+
+An ansible role that uses tailscale to generate an inventory of hosts and then render a prometheus config from it. Used by the Hachyderm team to automatically regenerate our prometheus configuration as our fleet changes.
+
+## Assumptions
+
+- `tailscale` is installed, up, and the `tailscale` binary is in your user's `$PATH`
+- `promtool` is installed (it should be if prometheus is installed), and it's also available on your `$PATH`
+
+## Using in your environment
+
+You'll definitely want to update the [prometheus.yml template](files/etc/prometheus-tailscale-discovery/prometheus.yml.tpl) to match your environment.
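+
+## Testing locally
+
+The `Makefile` is there for iterating on the template without touching a live
+host. It renders against a captured `tailscale status --json` dump (note that
+`files/sample/` is gitignored, so you'll need to supply your own):
+
+```sh
+tailscale status --json > files/sample/tailscale-status.json
+make render  # renders to files/sample/prometheus.rendered.yml
+make lint    # render, then check the output with promtool
+```
+
+The template only depends on a couple of fields from the status JSON. An
+abridged, purely illustrative example of the shape it expects (field names are
+the ones the template reads; everything else in the real output is ignored):
+
+```json
+{
+  "Self": { "HostName": "prod-prometheus01" },
+  "Peer": {
+    "nodekey:...": {
+      "HostName": "prod-mastodon-web01",
+      "Tags": ["tag:env-prod", "tag:role-mastodon-web"]
+    }
+  }
+}
+```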
diff --git a/files/etc/prometheus-tailscale-discovery/prometheus.yml.tpl b/files/etc/prometheus-tailscale-discovery/prometheus.yml.tpl
new file mode 100644
index 0000000..c0e2527
--- /dev/null
+++ b/files/etc/prometheus-tailscale-discovery/prometheus.yml.tpl
@@ -0,0 +1,264 @@
+####################################################################
+# Hachyderm Prometheus Config (/etc/prometheus/prometheus.yml)
+#
+# DO NOT EDIT THIS FILE MANUALLY - IT IS AUTOGENERATED PERIODICALLY
+# BY A SYSTEMD TIMER BASED ON TAGS IN TAILSCALE!
+#
+# Last rendered: {{ time.Now.Format "2006-01-02 15:04:05" }}
+####################################################################
+
+global:
+  scrape_interval: 30s
+  evaluation_interval: 1m
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+          # - alertmanager:9093
+
+# Load rules once and periodically evaluate them
+rule_files:
+  # - "rules.yml"
+
+
+scrape_configs:
+###############################################################################
+# SELF - PROMETHEUS
+###############################################################################
+  # first we scrape ourselves
+  - job_name: "prometheus"
+    static_configs:
+      - targets: ["prod-prometheus01:9090"]
+
+###############################################################################
+# SYNTHETICS - BLACKBOX EXPORTER
+###############################################################################
+  # do some blackbox probing to make sure we know when services are up and stuff
+  - job_name: "blackbox"
+    metrics_path: /probe
+    params:
+      module: [http_2xx_get]
+    static_configs:
+      - targets:
+          - https://hachyderm.io
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: localhost:9115
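+
+  # NOTE: every tailscale-discovered job below uses the same gomplate pattern:
+  # range over each peer in the status JSON, skip peers with no ACL tags at
+  # all (untagged nodes omit the "Tags" field, hence the coll.Has guard),
+  # then match `tag:env-*` to pick the environment and `tag:role-*` to pick
+  # which exporter port(s) to scrape.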
+
+###############################################################################
+# NODE EXPORTER (ALL)
+###############################################################################
+  # get some handy metrics from the node exporter (all nodes)
+  - job_name: "node"
+    static_configs:
+      - targets:
+          - "{{ (ds "tailscale").Self.HostName }}:9100"
+        labels:
+          env: prod
+      ### PROD ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-prod") }}
+          - "{{ $hostDetail.HostName }}:9100"
+{{- end }}
+{{- end }}
+        labels:
+          env: prod
+      ### DEV ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-dev") }}
+          - "{{ $hostDetail.HostName }}:9100"
+{{- end }}
+{{- end }}
+        labels:
+          env: dev
+
+###############################################################################
+# NGINX
+###############################################################################
+  # Scrape nginx exporter and nginxlog exporter for anything running nginx,
+  # like mastodon-web or edge-cdn
+  - job_name: "nginx"
+    static_configs:
+      ### PROD ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-prod") }}
+{{- if or (coll.Has $hostDetail.Tags "tag:role-mastodon-web") (coll.Has $hostDetail.Tags "tag:role-edge-cdn") (coll.Has $hostDetail.Tags "tag:role-mastodon-streaming") }}
+          - "{{ $hostDetail.HostName }}:4040" # nginxlog
+          - "{{ $hostDetail.HostName }}:9113"
+{{- end }}
+{{- end }}
+{{- end }}
+        labels:
+          env: prod
+      ### DEV ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-dev") }}
+{{- if or (coll.Has $hostDetail.Tags "tag:role-mastodon-web") (coll.Has $hostDetail.Tags "tag:role-edge-cdn") (coll.Has $hostDetail.Tags "tag:role-mastodon-streaming") }}
+          - "{{ $hostDetail.HostName }}:4040" # nginxlog
+          - "{{ $hostDetail.HostName }}:9113"
+{{- end }}
+{{- end }}
+{{- end }}
+        labels:
+          env: dev
+
+###############################################################################
+# ELASTICSEARCH STATSD
+###############################################################################
+  # statsd has been deprecated in mastodon-sidekiq, so we're only scraping this from
+  # the localhost (our mastodon stats exporter). also include elasticsearch nodes.
+  - job_name: "statsd"
+    static_configs:
+      - targets: ["localhost:9876"] # mastodon_api for hachyderm.io
+        labels:
+          env: prod
+      ### PROD ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-prod") }}
+{{- if or (coll.Has $hostDetail.Tags "tag:role-elasticsearch") }}
+          - "{{ $hostDetail.HostName }}:9102"
+{{- end }}
+{{- end }}
+{{- end }}
+        labels:
+          env: prod
+      ### DEV ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-dev") }}
+{{- if or (coll.Has $hostDetail.Tags "tag:role-elasticsearch") }}
+          - "{{ $hostDetail.HostName }}:9102"
+{{- end }}
+{{- end }}
+{{- end }}
+        labels:
+          env: dev
+    relabel_configs:
+      - source_labels: [__param_target]
+        target_label: instance
+
+###############################################################################
+# POSTGRESQL
+###############################################################################
+  # postgresql exporter
+  - job_name: "postgresql"
+    static_configs:
+      ### PROD ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-prod") }}
+{{- if or (coll.Has $hostDetail.Tags "tag:role-postgres-primary") (coll.Has $hostDetail.Tags "tag:role-postgres-backup") }}
+          - "{{ $hostDetail.HostName }}:9187"
+          - "{{ $hostDetail.HostName }}:9930" # pgcat
+{{- end }}
+{{- end }}
+{{- end }}
+        labels:
+          env: prod
+      ### DEV ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-dev") }}
+{{- if or (coll.Has $hostDetail.Tags "tag:role-postgres-primary") (coll.Has $hostDetail.Tags "tag:role-postgres-backup") }}
+          - "{{ $hostDetail.HostName }}:9187"
+          - "{{ $hostDetail.HostName }}:9930" # pgcat
+{{- end }}
+{{- end }}
+{{- end }}
+        labels:
+          env: dev
+
+###############################################################################
+# REDIS
+###############################################################################
+  - job_name: "redis"
+    static_configs:
+      ### PROD ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-prod") }}
+{{- if or (coll.Has $hostDetail.Tags "tag:role-redis") }}
+          - "{{ $hostDetail.HostName }}:9187"
+{{- end }}
+{{- end }}
+{{- end }}
+        labels:
+          env: prod
+      ### DEV ###############
+      - targets:
+{{- range $hostDetail := (ds "tailscale").Peer }}
+{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-dev") }}
+{{- if or (coll.Has $hostDetail.Tags "tag:role-redis") }}
+          - "{{ $hostDetail.HostName }}:9187"
+{{- end }}
+{{- end }}
+{{- end }}
+        labels:
+          env: dev
+
+###############################################################################
+# MASTODON STREAMING
+###############################################################################
+  - job_name: "streaming"
+    static_configs:
+      - targets: ["prod-mastodon-streaming01:4000",
+                  "prod-mastodon-streaming01:4001",
+                  "prod-mastodon-streaming01:4002"]
+        labels:
+          env: "prod"
+      - targets: ["dev-mastodon01:4000"]
+        labels:
+          env: "dev"
+    metric_relabel_configs:
+      - source_labels: [ __name__ ]
+        target_label: __name__
+        regex: '(.*)'
+        action: replace
+        replacement: mastodon_streaming_${1}
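+  # (the relabel above rewrites every metric name in place, prefixing it with
+  # mastodon_streaming_ - presumably because the streaming server doesn't
+  # namespace its own metrics)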
"prod-mastodon-streaming01:4001", "prod-mastodon-streaming01:4002"] + labels: + env: "prod" + - targets: ["dev-mastodon01:4000"] + labels: + env: "dev" + metric_relabel_configs: + - source_labels: [ __name__ ] + target_label: __name__ + regex: '(.*)' + action: replace + replacement: mastodon_streaming_${1} + +############################################################################### +# SIDEKIQ +############################################################################### + - job_name: "sidekiq" + static_configs: + ### PROD ############### + - targets: +{{- range $hostDetail := (ds "tailscale").Peer }} +{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-prod") }} +{{- if or (coll.Has $hostDetail.Tags "tag:role-mastodon-sidekiq") }} + - "{{ $hostDetail.HostName }}:9187" +{{- end }} +{{- end }} +{{- end }} + labels: + env: prod + ### DEV ############### + - targets: +{{- range $hostDetail := (ds "tailscale").Peer }} +{{- if and (coll.Has $hostDetail "Tags") (coll.Has $hostDetail.Tags "tag:env-dev") }} +{{- if or (coll.Has $hostDetail.Tags "tag:role-mastodon-sidekiq") }} + - "{{ $hostDetail.HostName }}:9187" +{{- end }} +{{- end }} +{{- end }} + labels: + env: dev diff --git a/files/usr/local/bin/prometheus-write-scrape-config-from-tailscale b/files/usr/local/bin/prometheus-write-scrape-config-from-tailscale new file mode 100755 index 0000000..b525a84 --- /dev/null +++ b/files/usr/local/bin/prometheus-write-scrape-config-from-tailscale @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +TAILSCALE_STATUS_CACHE=/var/cache/tailscale/status.json +PROMETHEUS_CONFIG_TEMPLATE=/etc/prometheus-tailscale-discovery/prometheus.yml.tpl +PROMETHEUS_RENDERED_CONFIG=/etc/prometheus/prometheus.rendered.yml +PROMETHEUS_CONFIG=/etc/prometheus/prometheus.yml +PROMETHEUS_CONFIG_BACKUP=/etc/prometheus/prometheus.yml.bak + +# if template doesn't exist, bail +if [[ ! -f "${PROMETHEUS_CONFIG_TEMPLATE}" ]]; then + echo "ERROR - Prometheus config template does not exist at ${PROMETHEUS_CONFIG_TEMPLATE}. Exiting." + exit 1 +fi + +if [[ ! -f "$TAILSCALE_STATUS_CACHE" ]]; then + echo "ERROR - Tailscale status cache does not exist at ${TAILSCALE_STATUS_CACHE}. Exiting." + exit 1 +fi + +gomplate -f ${PROMETHEUS_CONFIG_TEMPLATE} -d tailscale=${TAILSCALE_STATUS_CACHE} > ${PROMETHEUS_RENDERED_CONFIG} + +sha256() { + sha256sum "$1" | awk '{print $1}' +} + +if [[ -f "${PROMETHEUS_CONFIG}" ]]; then + starting_hash="$(sha256 "${PROMETHEUS_CONFIG}")" +else + starting_hash="" +fi + +echo "Starting Prometheus config sync from Tailscale" +echo "Original ${PROMETHEUS_CONFIG} file hash: ${starting_hash}" + +ending_hash="$(sha256 "${PROMETHEUS_RENDERED_CONFIG}")" + +if [[ "${starting_hash}" == "${ending_hash}" ]]; then + echo "No update" +else + echo "Config needs update. 
+
+    echo "Copying rendered config to ${PROMETHEUS_CONFIG}"
+    cp "${PROMETHEUS_RENDERED_CONFIG}" "${PROMETHEUS_CONFIG}"
+
+    echo "Sending SIGHUP to Prometheus to reload config"
+    killall -HUP prometheus
+fi
diff --git a/files/usr/local/bin/tailscale-write-status-cache b/files/usr/local/bin/tailscale-write-status-cache
new file mode 100755
index 0000000..56d61e2
--- /dev/null
+++ b/files/usr/local/bin/tailscale-write-status-cache
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+mkdir -p /var/cache/tailscale
+tailscale status --json > /var/cache/tailscale/status.json
diff --git a/requirements.yml b/requirements.yml
new file mode 100644
index 0000000..a3aaf9e
--- /dev/null
+++ b/requirements.yml
@@ -0,0 +1,4 @@
+---
+collections:
+  - name: https://github.com/ansible-collections/community.general
+    type: git
diff --git a/tasks/main.yml b/tasks/main.yml
new file mode 100644
index 0000000..2eb92ad
--- /dev/null
+++ b/tasks/main.yml
@@ -0,0 +1,75 @@
+---
+# Assumptions: prometheus and promtool are already installed and available on $PATH
+
+- name: Download pinned gomplate binary
+  ansible.builtin.get_url:
+    url: "{{ gomplate_binary_url }}"
+    dest: /usr/local/bin/gomplate
+    mode: "0755"
+    checksum: "sha256:{{ gomplate_binary_sha256 }}"
+
+- name: Ensure working directory
+  ansible.builtin.file:
+    path: "{{ prometheus_tailscale_directory_config_path }}"
+    state: directory
+    mode: "0755"
+
+- name: Install helper scripts
+  ansible.builtin.copy:
+    src: "{{ item }}"
+    dest: /usr/local/bin/
+    mode: "0755"
+  with_items:
+    - files/usr/local/bin/prometheus-write-scrape-config-from-tailscale
+    - files/usr/local/bin/tailscale-write-status-cache
+
+- name: Install templates to config dir
+  ansible.builtin.copy:
+    src: etc/prometheus-tailscale-discovery/prometheus.yml.tpl
+    dest: "{{ prometheus_tailscale_directory_config_path }}"
+    mode: "0644"
+
+- name: Install service
+  ansible.builtin.template:
+    src: etc/systemd/system/tailscale-write-status-cache.service
+    dest: /etc/systemd/system/
+    mode: "0644"
+
+- name: Install timer
+  ansible.builtin.template:
+    src: etc/systemd/system/tailscale-write-status-cache.timer
+    dest: /etc/systemd/system/
+    mode: "0644"
+
+- name: Install prom scrape config writer service
+  ansible.builtin.template:
+    src: etc/systemd/system/prometheus-write-scrape-config-from-tailscale.service
+    dest: /etc/systemd/system/
+    mode: "0644"
+
+- name: Reload systemd
+  ansible.builtin.systemd:
+    daemon_reload: yes
+
+- name: Enable the cache and prom config writers
+  ansible.builtin.systemd:
+    name: "{{ item }}"
+    enabled: yes
+  with_items:
+    - prometheus-write-scrape-config-from-tailscale.service
+    - tailscale-write-status-cache.service
+
+- name: Start the timer
+  ansible.builtin.systemd:
+    name: "{{ item }}"
+    state: started
+    enabled: yes
+  with_items:
+    - tailscale-write-status-cache.timer # KICK IT!
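+
+# Runtime flow (see templates/etc/systemd/system/): the timer fires
+# tailscale-write-status-cache.service on an interval set in vars/main.yml,
+# which dumps `tailscale status --json` to the cache file.
+# prometheus-write-scrape-config-from-tailscale.service is bound to it
+# (BindsTo/WantedBy), so every dump triggers a re-render, and prometheus only
+# gets a SIGHUP when the rendered config actually changed.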
diff --git a/templates/etc/systemd/system/prometheus-write-scrape-config-from-tailscale.service b/templates/etc/systemd/system/prometheus-write-scrape-config-from-tailscale.service
new file mode 100644
index 0000000..2e9c95b
--- /dev/null
+++ b/templates/etc/systemd/system/prometheus-write-scrape-config-from-tailscale.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=Render the prometheus config from the cached tailscale status
+After=tailscale-write-status-cache.service
+BindsTo=tailscale-write-status-cache.service
+
+[Install]
+WantedBy=tailscale-write-status-cache.service
+
+[Service]
+Type=oneshot
+WorkingDirectory=/etc/prometheus
+ExecStart=/usr/local/bin/prometheus-write-scrape-config-from-tailscale
diff --git a/templates/etc/systemd/system/tailscale-write-status-cache.service b/templates/etc/systemd/system/tailscale-write-status-cache.service
new file mode 100644
index 0000000..cb19114
--- /dev/null
+++ b/templates/etc/systemd/system/tailscale-write-status-cache.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Write a dump of `tailscale status --json` to /var/cache/tailscale/status.json
+
+[Service]
+Type=oneshot
+WorkingDirectory={{ prometheus_tailscale_directory_config_path }}
+ExecStart=/usr/local/bin/tailscale-write-status-cache
diff --git a/templates/etc/systemd/system/tailscale-write-status-cache.timer b/templates/etc/systemd/system/tailscale-write-status-cache.timer
new file mode 100644
index 0000000..f2c88cd
--- /dev/null
+++ b/templates/etc/systemd/system/tailscale-write-status-cache.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Write a dump of `tailscale status` to a file every {{ tailscale_write_status_cache_interval }}
+
+[Timer]
+OnBootSec=30s
+OnUnitActiveSec={{ tailscale_write_status_cache_interval }}
+RandomizedDelaySec={{ tailscale_write_status_cache_random_splay_seconds }}
+Unit=tailscale-write-status-cache.service
+
+[Install]
+WantedBy=timers.target
diff --git a/vars/main.yml b/vars/main.yml
new file mode 100644
index 0000000..4687ddc
--- /dev/null
+++ b/vars/main.yml
@@ -0,0 +1,16 @@
+---
+tailscale_write_status_cache_interval: 5min # How often to write the status cache to disk, in systemd time span format
+tailscale_write_status_cache_random_splay_seconds: 15 # Random splay added to the status cache write interval, in seconds
+
+prometheus_tailscale_directory_config_path: /etc/prometheus-tailscale-discovery
+
+gomplate_version: 4.2.0
+gomplate_binary_url: https://github.com/hairyhenderson/gomplate/releases/download/v{{ gomplate_version }}/gomplate_linux-{{ system_processor_architecture }}
+gomplate_binary_sha256: e095900ac880889cb9a56777448083048e4517a50d08b3db96f80c43db65710c
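+
+# NOTE: system_processor_architecture is not defined by this role - it's
+# expected to come from your inventory or playbook. One illustrative way to
+# derive it from gathered facts (adjust to your fleet):
+#   system_processor_architecture: "{{ 'amd64' if ansible_architecture == 'x86_64' else 'arm64' }}"
+# Also note the sha256 above pins one specific asset; other architectures (and
+# other gomplate versions) need their own checksum.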