diff --git a/setup-coder.sh b/setup-coder.sh
index 799a0a2..08cf5ec 100755
--- a/setup-coder.sh
+++ b/setup-coder.sh
@@ -460,13 +460,44 @@ else
     fi
 fi
 
-if run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && command -v node' &>/dev/null; then
-  info "Node.js already available for ${CODER_USER}: $(run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && node --version')"
+# Check if nvm has a node version installed (not just system node)
+# "nvm current" returns "system" or "none" when no nvm-managed node is active
+_nvm_current="$(run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && nvm current' 2>/dev/null || echo "none")"
+if [[ "$_nvm_current" != "system" && "$_nvm_current" != "none" && "$_nvm_current" != "N/A" ]]; then
+  info "Node.js already available via nvm for ${CODER_USER}: ${_nvm_current}"
 else
+  info "Installing Node.js LTS via nvm..."
   run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && nvm install --lts'
   info "Node.js installed: $(run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && node --version')"
 fi
 
+# Node.js 24+ requires C++20 for native module compilation (node-gyp)
+# Configure gcc-13 with C++20 flags if needed
+_node_major="$(run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && node --version' 2>/dev/null | sed 's/^v//' | cut -d. -f1)"
+# Guard explicitly against a non-numeric/empty major (node missing) instead of
+# relying on [[ -ge ]] erroring out under a hidden 2>/dev/null redirect.
+if [[ "$_node_major" =~ ^[0-9]+$ ]] && (( _node_major >= 24 )); then
+  # Ensure gcc-13 is installed (supports C++20)
+  if ! command -v g++-13 &>/dev/null; then
+    info "Installing g++-13 for Node ${_node_major} C++20 requirement..."
+    apt-get install -y --no-install-recommends g++-13 gcc-13
+  fi
+
+  # Add C++20 compiler settings to coder's environment if not already set
+  if ! grep -q 'CXX=.*g++-13' "${CODER_HOME}/.bashrc" 2>/dev/null; then
+    info "Configuring C++20 compiler for native modules..."
+    cat >> "${CODER_HOME}/.bashrc" << 'CXXFLAGS_EOF'
+
+# C++20 compiler settings for Node.js 24+ native modules (node-gyp)
+export CXX="g++-13"
+export CC="gcc-13"
+export CXXFLAGS="-std=c++20"
+CXXFLAGS_EOF
+    chown "${CODER_USER}:${CODER_USER}" "${CODER_HOME}/.bashrc"
+    info "C++20 compiler configured (g++-13)"
+  else
+    info "C++20 compiler already configured"
+  fi
+fi
+
 # ---------------------------------------------------------------------------
 # Step 5: Install Python ecosystem (uv + ruff + qt-mcp)
 # ---------------------------------------------------------------------------
@@ -901,7 +932,13 @@ if [[ -n "${AITOOLS_DIR}" && -d "${AITOOLS_DIR}/coder" ]]; then
     chown -R "${CODER_USER}:${CODER_USER}" "${_coder_dst}"
 
     info "Running npm install for coder-mcp..."
-    run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && cd ~/tools/coder && npm install --omit=dev'
+    # For Node 24+, explicitly set C++20 compiler (bashrc not sourced in non-interactive shells)
+    _node_major="$(run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && node --version' 2>/dev/null | sed 's/^v//' | cut -d. -f1)"
+    if [[ "$_node_major" =~ ^[0-9]+$ ]] && (( _node_major >= 24 )) && command -v g++-13 &>/dev/null; then
+      run_as_coder 'export CXX=g++-13 CC=gcc-13 CXXFLAGS="-std=c++20" NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && cd ~/tools/coder && npm install --omit=dev'
+    else
+      run_as_coder 'export NVM_DIR="$HOME/.nvm" && . "$NVM_DIR/nvm.sh" && cd ~/tools/coder && npm install --omit=dev'
+    fi
     info "coder-mcp installed"
   fi
 else
@@ -1092,8 +1129,8 @@ else
 fi
 
 # --- Git identity (--global defaults; override per-repo with git config user.name/email) ---
-_current_git_name="$(sudo -Hu "$CODER_USER" git config --global user.name 2>/dev/null || true)"
-_current_git_email="$(sudo -Hu "$CODER_USER" git config --global user.email 2>/dev/null || true)"
+_current_git_name="$(sudo -Hu "$CODER_USER" bash -c 'cd && git config --global user.name' 2>/dev/null || true)"
+_current_git_email="$(sudo -Hu "$CODER_USER" bash -c 'cd && git config --global user.email' 2>/dev/null || true)"
 
 if [[ -n "$_current_git_name" && -n "$_current_git_email" ]]; then
   info "Git identity: ${_current_git_name} <${_current_git_email}>"
@@ -1121,15 +1158,15 @@
     read -r _git_email
     _git_email="${_git_email:-$_git_email_default}"
   fi
-  sudo -Hu "$CODER_USER" git config --global user.name "$_git_name"
-  sudo -Hu "$CODER_USER" git config --global user.email "$_git_email"
+  sudo -Hu "$CODER_USER" bash -c "cd && git config --global user.name '$_git_name'"
+  sudo -Hu "$CODER_USER" bash -c "cd && git config --global user.email '$_git_email'"
   info "Git identity: ${_git_name} <${_git_email}>"
 fi
 
 # --- SSH commit signing ---
 # Only offer when SSH public keys were discovered from the primary user
 if [[ ${#_pub_keys[@]} -gt 0 ]]; then
-  _current_signing_key="$(sudo -Hu "$CODER_USER" git config --global user.signingkey 2>/dev/null || true)"
+  _current_signing_key="$(sudo -Hu "$CODER_USER" bash -c 'cd && git config --global user.signingkey' 2>/dev/null || true)"
 
   if [[ -n "$_current_signing_key" ]]; then
     info "SSH signing already configured: ${_current_signing_key}"
@@ -1181,10 +1218,10 @@ if [[ ${#_pub_keys[@]} -gt 0 ]]; then
     chown "${CODER_USER}:${CODER_USER}" "$_coder_sign_key"
     chmod 644 "$_coder_sign_key"
 
-    sudo -Hu "$CODER_USER" git config --global gpg.format ssh
-    sudo -Hu "$CODER_USER" git config --global user.signingkey "$_coder_sign_key"
-    sudo -Hu "$CODER_USER" git config --global commit.gpgsign true
-    sudo -Hu "$CODER_USER" git config --global tag.gpgsign true
+    sudo -Hu "$CODER_USER" bash -c "cd && git config --global gpg.format ssh"
+    sudo -Hu "$CODER_USER" bash -c "cd && git config --global user.signingkey '$_coder_sign_key'"
+    sudo -Hu "$CODER_USER" bash -c "cd && git config --global commit.gpgsign true"
+    sudo -Hu "$CODER_USER" bash -c "cd && git config --global tag.gpgsign true"
     info "SSH signing enabled: ${_sign_basename} → commits & tags"
   fi
 fi
@@ -1304,10 +1341,10 @@ check() {
   shift
   if "$@" &>/dev/null; then
     info "PASS: ${desc}"
-    (( PASS++ ))
+    (( ++PASS )) || true
   else
     err "FAIL: ${desc}"
-    (( FAIL++ ))
+    (( ++FAIL )) || true
   fi
 }
 
@@ -1359,23 +1396,25 @@ check "cratedex.service is active" \
 check "${CODER_USER} has ruff" \
   test -x "${CODER_HOME}/.local/bin/ruff"
 
-# Node.js
+# Node.js (use -H to set HOME correctly for nvm)
 check "${CODER_USER} has node" \
-  bash -c "sudo -u ${CODER_USER} -i bash -c 'export NVM_DIR=\"\$HOME/.nvm\" && . \"\$NVM_DIR/nvm.sh\" && node --version'"
+  bash -c "sudo -Hu ${CODER_USER} bash -c 'export NVM_DIR=\"\$HOME/.nvm\" && . \"\$NVM_DIR/nvm.sh\" && node --version'"
 
 # Agent CLIs
 check "${CODER_USER} has Claude Code CLI" \
-  bash -c "sudo -u ${CODER_USER} -i bash -c 'command -v claude'"
+  sudo -u "${CODER_USER}" test -x "${CODER_HOME}/.local/bin/claude"
 
 check "${CODER_USER} has OpenAI Codex CLI" \
-  bash -c "sudo -u ${CODER_USER} -i bash -c 'export NVM_DIR=\"\$HOME/.nvm\" && . \"\$NVM_DIR/nvm.sh\" && command -v codex'"
+  bash -c "sudo -Hu ${CODER_USER} bash -c 'export NVM_DIR=\"\$HOME/.nvm\" && . \"\$NVM_DIR/nvm.sh\" && command -v codex'"
 
 check "${CODER_USER} has Google Gemini CLI" \
-  bash -c "sudo -u ${CODER_USER} -i bash -c 'export NVM_DIR=\"\$HOME/.nvm\" && . \"\$NVM_DIR/nvm.sh\" && command -v gemini'"
+  bash -c "sudo -Hu ${CODER_USER} bash -c 'export NVM_DIR=\"\$HOME/.nvm\" && . \"\$NVM_DIR/nvm.sh\" && command -v gemini'"
 
-# MCP tools
-check "coder-mcp installed" \
-  test -f "${CODER_HOME}/tools/coder/bin/coder-mcp.js"
+# MCP tools (only check coder-mcp if AITOOLS_DIR was set or it was installed manually)
+if [[ -n "${AITOOLS_DIR:-}" || -d "${CODER_HOME}/tools/coder" ]]; then
+  check "coder-mcp installed" \
+    test -f "${CODER_HOME}/tools/coder/bin/coder-mcp.js"
+fi
 
 check "github-mcp image available" \
   run_as_coder 'podman image exists ghcr.io/github/github-mcp-server'
diff --git a/setup-homelab.sh b/setup-homelab.sh
index 215afde..a194279 100755
--- a/setup-homelab.sh
+++ b/setup-homelab.sh
@@ -102,7 +102,14 @@ confirm() {
 }
 
 run_as_coder() {
-  sudo -Hu "$CODER_USER" bash -lc "cd \"\$HOME\" && $1"
+  # Always run from coder's HOME (avoids "cannot chdir" permission issues)
+  # Also ensure XDG_RUNTIME_DIR exists for rootless podman commands.
+  sudo -Hu "$CODER_USER" bash -lc '
+    set -e
+    export XDG_RUNTIME_DIR="/run/user/$(id -u)"
+    cd "$HOME"
+    '"$1"'
+  '
 }
 
 # ---------------------------------------------------------------------------
@@ -141,10 +148,16 @@ do_rollback() {
   systemctl restart systemd-resolved || true
   info "Restarted systemd-resolved"
 
-  # Stop homelab pod (Podman Quadlet services under coder user)
+  # Stop homelab services (Podman Quadlet services under coder user)
   if id "$CODER_USER" &>/dev/null; then
+    # Try pod services (Podman 5.x+)
     systemctl --user -M "${CODER_USER}@" stop homelab-pod.service 2>/dev/null || true
-    info "Stopped homelab pod"
+    systemctl --user -M "${CODER_USER}@" stop pod-homelab.service 2>/dev/null || true
+    # Try individual services (Podman 4.x)
+    systemctl --user -M "${CODER_USER}@" stop pihole.service 2>/dev/null || true
+    systemctl --user -M "${CODER_USER}@" stop traefik.service 2>/dev/null || true
+
+    info "Stopped homelab services"
   fi
 
   # Re-enable nginx
@@ -202,6 +215,11 @@ fi
 PODMAN_VERSION="$(podman --version 2>/dev/null | awk '{print $NF}')"
 info "Podman ${PODMAN_VERSION}"
 
+# Note: We use a shared Podman network instead of Pod= for two reasons:
+# 1. Pod= requires Podman 5.0+ (not available in many distros yet)
+# 2. Pi-hole v6 ignores WEB_PORT and always listens on 80, conflicting with Traefik in a pod
+info "Using shared Podman network for container communication"
+
 if ! command -v crun &>/dev/null; then
   err "crun is not installed. Install with: apt-get install crun"
   exit 1
@@ -306,10 +324,17 @@
 chmod 2775 "${HOMELAB_DIR}"
 chmod 2775 "${HOMELAB_DIR}/traefik/dynamic"
 chmod 2775 "${HOMELAB_DIR}/services"
 
-# Ensure coder user can write to ACME and Pi-hole data dirs
+# Ensure coder user can write to ACME dir
 chown -R "${CODER_USER}:homelab" "${HOMELAB_DIR}/traefik/acme"
-chown -R "${CODER_USER}:homelab" "${HOMELAB_DIR}/pihole/etc-pihole"
-chown -R "${CODER_USER}:homelab" "${HOMELAB_DIR}/pihole/etc-dnsmasq.d"
+
+# Pi-hole runs as UID 1000 inside the container. For rootless Podman, we need
+# to set ownership using the mapped UID. Use podman unshare to do this correctly.
+# First set base ownership so coder can access, then fix for container UID mapping.
+chown -R "${CODER_USER}:homelab" "${HOMELAB_DIR}/pihole"
+run_as_coder "podman unshare chown -R 1000:1000 \"${HOMELAB_DIR}/pihole/etc-pihole\"" || \
+  warn "Could not set pihole ownership via podman unshare (will retry on container start)"
+run_as_coder "podman unshare chown -R 0:0 \"${HOMELAB_DIR}/pihole/etc-dnsmasq.d\"" || \
+  warn "Could not set dnsmasq.d ownership via podman unshare"
 
 info "Directories created (group: homelab)"
@@ -452,7 +477,11 @@ http:
 DASHBOARD_YML
 info "Created dashboard.yml"
 
-# Pi-hole admin UI route (container shares pod network — localhost:80)
+# Pi-hole admin UI route
+# Traefik reaches Pi-hole via the shared 'homelab' network using container name
+# Note: Pi-hole v6 ignores WEB_PORT env var and always listens on 80/443
+PIHOLE_BACKEND_URL="http://pihole:80"
+
 cat > "${HOMELAB_DIR}/traefik/dynamic/pihole.yml" << PIHOLE_YML
 http:
   routers:
@@ -467,9 +496,9 @@
     pihole:
       loadBalancer:
         servers:
-          - url: "http://localhost:8080"
+          - url: "${PIHOLE_BACKEND_URL}"
 PIHOLE_YML
-info "Created pihole.yml"
+info "Created pihole.yml (backend: ${PIHOLE_BACKEND_URL})"
 
 # Ollama route (reaches host via LAN IP — separate container, not in the pod)
 cat > "${HOMELAB_DIR}/traefik/dynamic/ollama.yml" << OLLAMA_YML
@@ -592,15 +621,50 @@ if [[ ! -d "$_xdg_runtime" ]]; then
   chmod 700 "$_xdg_runtime"
 fi
 
-# Create secrets idempotently (--replace overwrites if exists)
-printf '%s' "${CF_DNS_API_TOKEN}" | run_as_coder 'podman secret create --replace cf-dns-token -'
-info "Podman secret: cf-dns-token"
+create_secret_safe() {
+  local name="$1"
+  local value="$2"
+
+  # Do NOT use --replace; it triggers a delete path that can fail if store is inconsistent.
+  # Instead: try rm (ignore errors), then create.
+  run_as_coder "podman secret rm ${name} >/dev/null 2>&1 || true"
+
+  if printf '%s' "${value}" | run_as_coder "podman secret create ${name} -"; then
+    info "Podman secret: ${name}"
+    return 0
+  fi
+
+  warn "Failed to create secret '${name}'. Repairing coder secret store and retrying..."
+
+  # Stop services using secrets (best-effort, try both pod and individual services)
+  systemctl --user -M "${CODER_USER}@" stop homelab-pod.service 2>/dev/null || true
+  systemctl --user -M "${CODER_USER}@" stop pihole.service 2>/dev/null || true
+  systemctl --user -M "${CODER_USER}@" stop traefik.service 2>/dev/null || true
+
+  # Backup state DB (best-effort)
+  run_as_coder 'cp -a ~/.local/share/containers/storage/libpod/bolt_state.db ~/.local/share/containers/storage/libpod/bolt_state.db.bak.$(date +%s) 2>/dev/null || true'
 
-printf '%s' "${PIHOLE_PASSWORD}" | run_as_coder 'podman secret create --replace pihole-password -'
-info "Podman secret: pihole-password"
+  # Wipe only secrets payload + any stale secret metadata files (best-effort)
+  run_as_coder 'rm -rf ~/.local/share/containers/storage/secrets'
+  run_as_coder 'mkdir -p ~/.local/share/containers/storage/secrets'
 
-printf '%s' "${TRAEFIK_DASHBOARD_USERS}" | run_as_coder 'podman secret create --replace traefik-dashboard-users -'
-info "Podman secret: traefik-dashboard-users"
+  # Retry: rm ignore + create
+  run_as_coder "podman secret rm ${name} >/dev/null 2>&1 || true"
+  if printf '%s' "${value}" | run_as_coder "podman secret create ${name} -"; then
+    info "Podman secret: ${name} (after repair)"
+    return 0
+  fi
+
+  err "Still failed to create secret '${name}' after repair."
+  err "Try the nuclear reset for coder rootless podman state:"
+  err "  systemctl --user -M coder@ stop homelab-pod.service pihole.service traefik.service 2>/dev/null || true"
+  err "  sudo -Hu coder bash -lc 'cd ~ && podman system reset -f'"
+  return 1
+}
+
+create_secret_safe "cf-dns-token" "${CF_DNS_API_TOKEN}"
+create_secret_safe "pihole-password" "${PIHOLE_PASSWORD}"
+create_secret_safe "traefik-dashboard-users" "${TRAEFIK_DASHBOARD_USERS}"
 
 # ---------------------------------------------------------------------------
 # Step 9: Write Podman Quadlet files
@@ -611,19 +675,6 @@
 QUADLET_DIR="${CODER_HOME}/.config/containers/systemd"
 mkdir -p "$QUADLET_DIR"
 
-# --- homelab.pod ---
-cat > "${QUADLET_DIR}/homelab.pod" << POD_FILE
-[Pod]
-PodName=homelab
-PublishPort=80:80
-PublishPort=443:443
-PublishPort=${LAN_IP}:53:53/tcp
-PublishPort=${LAN_IP}:53:53/udp
-PublishPort=127.0.0.1:53:53/tcp
-PublishPort=127.0.0.1:53:53/udp
-POD_FILE
-info "Written homelab.pod"
-
 # --- traefik-acme.volume ---
 cat > "${QUADLET_DIR}/traefik-acme.volume" << VOL_FILE
 [Volume]
 VOL_FILE
 
 cat > "${QUADLET_DIR}/traefik-logs.volume" << VOL_FILE
 [Volume]
 VolumeName=traefik-logs
 VOL_FILE
@@ -636,15 +687,27 @@
 
+# Remove any leftover pod file from previous installs (pods not used due to port conflicts)
+rm -f "${QUADLET_DIR}/homelab.pod"
+
+# --- homelab.network ---
+cat > "${QUADLET_DIR}/homelab.network" << NETWORK_QUADLET
+[Network]
+NetworkName=homelab
+NETWORK_QUADLET
+info "Written homelab.network"
+
 # --- traefik.container ---
 cat > "${QUADLET_DIR}/traefik.container" << TRAEFIK_QUADLET
 [Unit]
 Description=Traefik reverse proxy + TLS termination
 
 [Container]
 Image=${TRAEFIK_IMAGE}
 ContainerName=traefik
-Pod=homelab.pod
+Network=homelab.network
+PublishPort=80:80
+PublishPort=443:443
 Secret=cf-dns-token,type=env,target=CF_DNS_API_TOKEN
Volume=${HOMELAB_DIR}/traefik/traefik.yml:/etc/traefik/traefik.yml:ro Volume=${HOMELAB_DIR}/traefik/dynamic:/etc/traefik/dynamic:ro @@ -654,26 +717,31 @@ Volume=traefik-logs.volume:/var/log/traefik [Install] WantedBy=default.target TRAEFIK_QUADLET -info "Written traefik.container" + info "Written traefik.container" -# --- pihole.container --- -# Pi-hole's web UI runs on port 80 inside the container, but the pod already -# publishes 80 for Traefik. We remap Pi-hole's web to 8080 inside the pod -# namespace (Traefik's pihole.yml routes to localhost:8080). -cat > "${QUADLET_DIR}/pihole.container" << PIHOLE_QUADLET + # --- pihole.container --- + # Pi-hole web UI is accessed via Traefik (http://pihole:80 on internal network) + # Only DNS ports need to be published to the host + # Requires network-online to ensure LAN_IP is available before binding + cat > "${QUADLET_DIR}/pihole.container" << PIHOLE_QUADLET [Unit] Description=Pi-hole DNS + ad blocker +After=network-online.target +Wants=network-online.target [Container] Image=${PIHOLE_IMAGE} ContainerName=pihole -Pod=homelab.pod +Network=homelab.network +PublishPort=${LAN_IP}:53:53/tcp +PublishPort=${LAN_IP}:53:53/udp +PublishPort=127.0.0.1:53:53/tcp +PublishPort=127.0.0.1:53:53/udp Secret=pihole-password,type=env,target=WEBPASSWORD Environment=FTLCONF_misc_etc_dnsmasq_d=true Environment=FTLCONF_dns_upstreams=${UPSTREAM_DNS} Environment=PIHOLE_DNS_=${UPSTREAM_DNS} Environment=TZ=${TZ} -Environment=WEB_PORT=8080 Volume=${HOMELAB_DIR}/pihole/etc-pihole:/etc/pihole Volume=${HOMELAB_DIR}/pihole/etc-dnsmasq.d:/etc/dnsmasq.d @@ -803,16 +871,51 @@ fi step "Starting Traefik + Pi-hole" -# Reload systemd user daemon to pick up new Quadlet files, then start the pod +# Reload systemd user daemon to pick up new Quadlet files systemctl --user -M "${CODER_USER}@" daemon-reload -systemctl --user -M "${CODER_USER}@" start homelab-pod.service + +# Give systemd a moment to process the quadlet files +sleep 2 + +# Start traefik, then pihole 
(they share 'homelab' network) +if systemctl --user -M "${CODER_USER}@" start traefik.service; then + info "Started traefik.service" + if systemctl --user -M "${CODER_USER}@" start pihole.service; then + info "Started pihole.service" + else + warn "Failed to start pihole.service — check logs with: sudo -u coder journalctl --user -u pihole.service" + fi +else + err "Failed to start traefik.service. Available units:" + systemctl --user -M "${CODER_USER}@" list-unit-files | grep -iE 'homelab|traefik|pihole' || true + err "Check quadlet generator output:" + run_as_coder '/usr/libexec/podman/quadlet --dryrun --user 2>&1' | head -20 || true + exit 1 +fi # Wait for containers to be healthy info "Waiting for containers to start..." sleep 5 -run_as_coder 'podman pod ps' || true -run_as_coder 'podman ps --pod --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}\t{{.Pod}}"' || true +run_as_coder 'podman ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"' || true + +# Wait for Pi-hole's FTL DNS service to be ready (may take time on first boot for gravity db) +info "Waiting for Pi-hole DNS to be ready..." +PIHOLE_READY=false +for i in {1..30}; do + if run_as_coder 'podman exec pihole pihole status 2>/dev/null' | grep -q "DNS service is running"; then + PIHOLE_READY=true + info "Pi-hole DNS is ready" + break + fi + sleep 2 +done + +if [[ "$PIHOLE_READY" != "true" ]]; then + warn "Pi-hole DNS may not be fully ready yet. If DNS doesn't work, try:" + warn " sudo -u coder podman restart pihole" + warn " # Wait 30 seconds, then test: nslookup test.${DOMAIN} 127.0.0.1" +fi # --------------------------------------------------------------------------- # Step 15: Register services