diff --git a/.gitignore b/.gitignore index 33c1d12..6bdbbce 100644 --- a/.gitignore +++ b/.gitignore @@ -124,9 +124,7 @@ celerybeat.pid # Environments .env .venv -#env/ venv/ -ENV/ env.bak/ venv.bak/ @@ -160,3 +158,15 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +# Data Folder +data/* +# TODO: Remove when you need the cut background file anymore +# Raw data folder excluding only the background cut file +# data/raw/ + +# Modules Pool +*.cfmodule + +# OS +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index 58ff8bc..ec6cd4a 100644 --- a/README.md +++ b/README.md @@ -1,101 +1,239 @@ -# cosiflow +# Cosiflow -The COSI SDOC pipeline based on Apache Airflow +Cosiflow provides an Airflow-based orchestration environment for managing and monitoring scientific pipelines for COSI. -## Build the cosiflow docker +--- -We assume that the cosiflow repository is in your $HOME directory. +## Quick Start Overview -```bash -cd $HOME/cosiflow/env -``` +There are two main steps to using Cosiflow: -Mac: +1. **Set up and run the core Cosiflow environment** (this README): + - Configure `env/docker-compose.yaml` (UID/GID, Airflow admin password, ports, etc.). + - Build and start the Docker Compose stack. + - Access the Airflow Web UI. -```bash -docker build --platform linux/arm64 -t airflow:1.0.0 -f Dockerfile . -``` +2. **(Optional) Install one or more pipeline modules** (e.g. analysis pipelines): + - Each module lives alongside `cosiflow/` (for example `fast-transient-analysis-pipeline/`). + - Modules provide DAGs and pipeline scripts that are plugged into the running Cosiflow instance. + - To install a module into Cosiflow, follow the guide in `env/README.md`. -Linux: +If you only want to bring up a plain Cosiflow environment to explore Airflow and the COSIDAG framework, step 1 is sufficient. 
+If you want to run a specific scientific pipeline (e.g. the Fast Transient Analysis Pipeline), you must also complete step 2. -```bash -docker build -t airflow:1.1.0 -f Dockerfile . -``` +--- -## Execute the docker compose to start containers +### 1. REQUIREMENTS -```bash -docker compose up -d -``` +#### CONFIGURE ENVIRONMENT VARIABLES (NO `.env` FILE) -If you want to enter into the postgre docker container: `docker compose exec postgres bash` +All configuration is now done directly in `env/docker-compose.yaml` (there is **no** `.env` file anymore). -If you want to enter into the postgre docker container: `docker compose exec airflow bash` +1. Move into the `env` folder: + ```bash + cd env + ``` -## Connect to the web server using a browser +2. Find your local user and group IDs: -localhost:8080 + ```bash + id -u # YOUR_USER_ID + id -g # YOUR_GROUP_ID + ``` -Note: if you use a remote server you can change the `docker-compose.yaml` file to use another port. +3. Open `docker-compose.yaml` and locate the `x-common-env` block at the top. + Replace the default values with your IDs: -For example: - - ```yaml - ports: - - "28080:8080" - ``` + ```yaml + UID: ${UID:-} # TOEDIT + GID: ${GID:-} # TOEDIT + ``` + +4. In the same `x-common-env` block, set a secure password for the Airflow Web UI: + + ```yaml + AIRFLOW_ADMIN_PASSWORD: ${AIRFLOW_ADMIN_PASSWORD:-} # TOEDIT + ``` + +5. (Optional, but recommended to review) + Still in `x-common-env`, check the variables marked with `# TOEDIT` comments + (e.g. `HOST_IP`, `MAILHOG_WEBUI_PORT`, `AIRFLOW_WEBUI_PORT`) and adjust them + if the defaults are not suitable for your setup. -then from your local pc you can forward the port in this way: +#### PREPARE THE FOLDER FOR STORING POSTGRES DATA +```bash +cd .. +mkdir -p data/postgres_data +``` + +--- + +### 2. 
BUILD THE COMPOSE + +Build all containers defined in `env/docker-compose.yaml`: ```bash -ssh -N -L 28080:localhost:28080 [user]@[remote machine] +cd env +docker compose build ``` -and open the airflow webpace from your local pc at `localhost:28080` +⏱ Estimated build time: **~490 seconds** + +--- -Login with username: `admin` password: `` +### 3. RUN THE CONTAINER -To obtain the password `` execute this command after the initialization of the containers +To run with logs visible: ```bash -docker compose logs | grep pass +docker compose up ``` -### Shutdown the dockers +To run in detached mode (no logs): ```bash -docker compose down -v +docker compose up -d ``` -## Test the cosipy DAG +--- + +### 4. ENTER THE CONTAINER -Enter in the docker airflow +To open a terminal inside the running Airflow container: ```bash docker compose exec airflow bash ``` -First download the data file from wasabi. +--- -```bash -cd /shared_dir/pipeline -source activate cosipy -python initialize_pipeline.py -``` +### 5. CONNECT TO THE AIRFLOW WEB UI + +1. Open your web browser and go to: + + [http://localhost:8080/home](http://localhost:8080/home) -This script downloads the input file from wasabi and move it in `/home/gamma/workspace/data` +2. Enter the user credentials: + ```text + user: admin + password: the AIRFLOW_ADMIN_PASSWORD value you set in docker-compose.yaml + ``` -Now we must activate the DAG named `"cosipt_test_v0"` from the airflow website +--- -Then we have to copy the file in the input directory to trigger the DAG +### 6. STOP THE CONTAINER + +To stop and remove all running containers, networks, and volumes: ```bash -cd /home/gamma/workspace/data -cp GalacticScan.inc1.id1.crab2hr.extracted.tra.gz input +docker compose down -v ``` -We should see that the DAG started to process the data. +--- + +### 7. 
CONFIGURATIONS + +Below is the list of the main environment variables configured in `env/docker-compose.yaml` +inside the `x-common-env` block (and related sections), with their purpose: + +| Variable | Description | +|-----------|--------------| +| **UID** | User ID used inside containers (must match your local user) | +| **GID** | Group ID used inside containers (must match your local group) | +| **DISPLAY** | Display variable for X11 forwarding (optional) | +| **AIRFLOW_ADMIN_USERNAME** | Default Airflow Web UI username | +| **AIRFLOW_ADMIN_EMAIL** | Email associated with Airflow admin user | +| **AIRFLOW_ADMIN_PASSWORD** | Secure password for Airflow Web UI (must be set by you) | +| **HOST_IP** | Host IP used to construct service URLs (e.g. Web UIs) | +| **MAILHOG_WEBUI_PORT** | Port for the MailHog Web UI | +| **AIRFLOW_WEBUI_PORT** | Port for the Airflow Web UI | +| **POSTGRES_USER** | Username for the Airflow PostgreSQL database | +| **POSTGRES_DB** | Database name for the Airflow PostgreSQL database | +| **POSTGRES_PASSWORD** | Password for the Airflow PostgreSQL database | +| **ALERT_USERS_LIST_PATH** | Path to YAML file containing user alert configurations | +| **ALERT_SMTP_SERVER** | SMTP server used for alert notifications | +| **ALERT_EMAIL_SENDER** | Email address used as sender for system alerts | +| **ALERT_LOG_PATH** | Path to Airflow log file monitored by alert system | +| **AIRFLOW__SMTP__SMTP_STARTTLS** | Enables/disables STARTTLS (default: False) | +| **AIRFLOW__SMTP__SMTP_SSL** | Enables/disables SMTP over SSL (default: False) | +| **COSI_DATA_DIR** | Root directory for COSI data | +| **COSI_INPUT_DIR** | Directory for COSI input data | +| **COSI_LOG_DIR** | Directory for COSI log files | +| **COSI_OBS_DIR** | Directory for observation data | +| **COSI_TRANSIENT_DIR** | Directory for transient event data | +| **COSI_TRIGGER_DIR** | Directory for trigger event data | +| **COSI_MAPS_DIR** | Directory for map data products | +| 
**COSI_SOURCE_DIR** | Directory for source-level data products | + +--- + +### NOTES + +- Configuration is done directly in `env/docker-compose.yaml`; there is no `.env` file. +- Variables that are important to customize are explicitly marked with `# TOEDIT` comments in `docker-compose.yaml`. +- To inspect container logs, use: + ```bash + docker compose logs -f airflow + ``` + +- For details on how to install and manage modules (pipelines) within Cosiflow, see `env/README.md`. + +--- + +**Cosiflow environment ready for use.** + +--- + +## What is COSIDAG + +A **COSIDAG** (COSI DAG) is a structured abstraction built on top of Apache Airflow DAGs. + +It provides a **standardized workflow layout** for scientific pipelines, reducing boilerplate and enforcing consistent patterns across different analyses. + +In particular, a COSIDAG: + +* defines a common execution skeleton (input resolution, optional monitoring, result handling) +* encapsulates best practices for: + + * file discovery + * parameter propagation + * XCom-based communication +* allows developers to focus only on **scientific tasks**, while orchestration logic is handled automatically + +COSIDAGs are used for all production scientific pipelines (e.g. Light Curve, TS Map), while standard DAGs are reserved for orchestration, testing, or utilities. + +**How to write and customize a COSIDAG** is explained in detail in the [tutorial section](modules/README.md). + +--- + +## Tutorials and developer guide + +A complete, step-by-step guide on how to: + +* understand the COSIDAG execution model +* write new COSIDAGs +* add custom tasks +* use XCom correctly +* integrate external Python environments + +is available in: + +[tutorial section](tutorials/README.md). + +This is the **recommended starting point for developers**. 
+ +--- + +## Available DAGs and COSIDAGs + +A complete and up-to-date list of all DAGs and COSIDAGs implemented in this repository — including: + +* workflow purpose +* inputs and outputs +* task structure +* operators used +* XCom usage -This directory `/home/gamma/workspace/heasarc/dl0` contains several folders with this format `2025-01-24_14-31-56`. +is documented in: [DAG and COSIDAG LIST README](dags/README.md) -Inside the folder we have the results of the analysis. +This document serves as the **catalog and reference** for all workflows available in Cosiflow. \ No newline at end of file diff --git a/callbacks/__init__.py b/callbacks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/callbacks/on_failure_callback.py b/callbacks/on_failure_callback.py new file mode 100644 index 0000000..3868b40 --- /dev/null +++ b/callbacks/on_failure_callback.py @@ -0,0 +1,82 @@ +import yaml +import os +from airflow.utils.email import send_email +import urllib.parse + +ALERT_CONFIG_PATH = os.getenv("ALERT_USERS_LIST_PATH", "/home/gamma/env/alert_users.yaml") + + +def load_alert_config(): + with open(ALERT_CONFIG_PATH, "r") as f: + return yaml.safe_load(f) + + +def get_recipients(keyword: str) -> list[str]: + config = load_alert_config() + matched_groups = set() + + for rule in config.get("rules", []): + if rule["pattern"] == keyword: + matched_groups.update(rule["notify"]) + + emails = set() + for group in matched_groups: + group_data = config["groups"].get(group) + if group_data: + emails.update(group_data.get("emails", [])) + return sorted(emails) + + + +def notify_email(context): + task = context["task_instance"] + dag_id = task.dag_id + task_id = task.task_id + run_id = task.run_id + execution_date = context.get("execution_date") + + # URL-encode i parametri per sicurezza + base_url = "http://localhost:8080" + query = urllib.parse.urlencode({ + "execution_date": execution_date.isoformat(), + "tab": "logs", + "dag_run_id": run_id, + "task_id": task_id + }) 
+ log_url = f"{base_url}/dags/{dag_id}/grid?{query}" + + # Percorso log locale (personalizzabile) + log_path = f"/home/gamma/airflow/logs/dag_id={dag_id}/run_id={run_id}/task_id={task_id}/attempt=1.log" + if not os.path.exists(log_path): + log_preview = "⚠️ Log file not found." + else: + with open(log_path, "r") as f: + lines = f.readlines()[-30:] # Ultime 30 righe + log_preview = "".join(lines) + log_preview = log_preview.replace("<", "<").replace(">", ">") # Escaping HTML + + recipients = get_recipients("ALERT_FAIL") + if not recipients: + return # no recipients, skip + + subject = f"[ALERT] Task {task.task_id} in DAG {task.dag_id} has failed" + html_content = f""" + + +

⚠️ Task Failure Alert

+ + + + + +
DAG:{dag_id}
Task:{task_id}
Execution Time:{execution_date}
Log URL:{log_url}
+

🔍 Log Preview

+
+{log_preview}
+        
+

Full log available at the link above.

+ + + """ + + send_email(to=recipients, subject=subject, html_content=html_content) diff --git a/dags/README.md b/dags/README.md new file mode 100644 index 0000000..0de2a4a --- /dev/null +++ b/dags/README.md @@ -0,0 +1,563 @@ +# DAG & COSIDAG Catalog + +This document describes all **DAGs** and **COSIDAGs** available in this repository, including: + +* DAG title +* workflow type (DAG vs COSIDAG) +* purpose +* inputs / outputs +* number of tasks (and task layout) +* operator types +* XCom usage (inter-task communication) + +--- + +## COSIDAG framework (module) + +### File + +`cosidag.py` + +### What it is + +This is **not a DAG**. +It defines the **`COSIDAG`** convenience class used by multiple pipelines. + +### Standard COSIDAG layout + +A COSIDAG wires a common pattern: + +1. `check_new_file` *(optional)* +2. `automatic_retrig` *(optional)* +3. `resolve_inputs` *(optional)* +4. `[custom tasks]` +5. `show_results` *(always)* + +### Operators used internally + +* `PythonOperator` +* `EmptyOperator` +* `TriggerDagRunOperator` +* (also sensors / utilities internally, depending on configuration) + +### XCom contract (key design) + +* `detected_folder` is pushed by `check_new_file` (or by a fallback setter task when monitoring is disabled) +* resolved inputs are pushed by `resolve_inputs` using keys from `file_patterns` +* `show_results` reads `detected_folder` from XCom and prints a results URL (if configured) + +✅ This file is the **contract** that all COSIDAG-based pipelines rely on. 
+ +--- + +# Entry-point DAGs + +## `init_pipelines` + +### File + +`cosipipe_simdata.py` (header indicates: `# dags/init_pipelines.py`) + +### Type + +**Standard DAG** (entry-point / initializer) + +### Workflow purpose + +Single entry point that: + +* prepares/stages raw inputs +* resolves configuration +* creates the run/products directory +* optionally performs a background cut +* creates symlinks to standardized locations + +This is the DAG you trigger from the Airflow UI to bootstrap a pipeline run. + +### Inputs + +Airflow UI params / configuration (conceptually): + +* destination selector (where to save products) +* paths for source/background/orientation/response (or folders from which they can be resolved) +* optional “date/selection policy” style filters + +### Outputs + +* standardized run folder (products directory) +* staged inputs (symlinked or copied, depending on logic) +* background cut output (if enabled) + +### Number of tasks + +**6 tasks** + +* `prepare_raw_dirs` +* `resolve_config` +* `stage_all_files` +* `create_products_dir` +* `background_cut` +* `create_symlinks` + +### Operators used + +* `PythonOperator` +* `ExternalPythonOperator` (used for scientific steps executed in the `cosipy` conda env) + +### XCom usage + +✅ **Yes** +Used to propagate resolved configuration and output paths between tasks. + +--- + +# Scientific COSIDAG pipelines + +## `cosidag_tsmap` + +### File + +`cosidag_tsmap.py` + +### Type + +**COSIDAG** + +### Workflow purpose + +TS Map computation pipeline (binned GRB + background → TS map products). + +### Inputs + +Resolved via COSIDAG `file_patterns` (search under detected folder): + +* `grb_file`: `GRB*_unbinned_*.fits*` +* `background_file`: `Total_BG*_unbinned_*.fits*` +* `orientation_file`: `*.ori` +* `response_file`: `Response*.h5` + +Plus COSIDAG parameters (monitoring folders, date queries, selection policy, etc.). 
+ +### Outputs + +* binned GRB file +* binned background file +* TS map products (standard and multi-resolution) +* results stored in the run/products directory used by COSIDAG + +### Number of tasks + +**~8 total** (COSIDAG base tasks + custom tasks) + +**Base COSIDAG tasks (created by framework):** + +* `check_new_file` +* `automatic_retrig` *(depends on config)* +* `resolve_inputs` +* `show_results` + +**Custom tasks in this pipeline:** + +* `bin_grb_source` +* `bin_background` +* `ts_map_computation` +* `ts_map_mulres_computation` + +### Operators used + +* `ExternalPythonOperator` (science steps) +* plus COSIDAG internal operators (see `cosidag.py`) + +### XCom usage + +✅ **Yes** + +* COSIDAG publishes resolved input paths (`resolve_inputs`) +* custom tasks read paths using templated `ti.xcom_pull(...)` (return_value pattern is used) + +--- + +## `cosidag_lcurve` + +### File + +`cosidag_lcurve.py` + +### Type + +**COSIDAG** + +### Workflow purpose + +Light Curve plotting pipeline (binned GRB + background → light curve products). 
+ +### Inputs + +Resolved via COSIDAG `file_patterns`: + +* `grb_file`: `GRB*_unbinned_*.fits*` +* `background_file`: `Total_BG*_unbinned_*.fits*` +* `orientation_file`: `*.ori` +* `response_file`: `Response*.h5` + +### Outputs + +* binned GRB file +* binned background file +* light curve plot(s) / products saved into the pipeline output folder + +### Number of tasks + +**~7 total** (COSIDAG base tasks + custom tasks) + +**Base COSIDAG tasks:** + +* `check_new_file` +* `automatic_retrig` *(enabled in config in this file)* +* `resolve_inputs` +* `show_results` + +**Custom tasks in this pipeline:** + +* `bin_grb_source` +* `bin_background` +* `plot_lightcurve` + +### Operators used + +* `ExternalPythonOperator` +* plus COSIDAG internal operators + +### XCom usage + +✅ **Yes** +Uses XCom for: + +* detected folder +* resolved inputs +* passing file paths between binning and plotting + +--- + +# Tutorial / example COSIDAGs + +## `cosidag_example` + +### File + +`cosidag_example.py` + +### Type + +**COSIDAG (example)** + +### Workflow purpose + +Demonstrates how to attach a custom task to a COSIDAG and how to: + +* consume `detected_folder` via XCom +* search files inside the detected folder via helper `dag.find_file_by_pattern(...)` + +### Inputs + +* `monitoring_folders` points to a sample location (example uses `/home/gamma/workspace/data/tsmap`) +* COSIDAG detection parameters (depth, date queries, etc.) 
+ +### Outputs + +* logs + demonstration of resolved file path (printed) +* depends on your custom implementation + +### Number of tasks + +**~5–6 total** (COSIDAG base tasks + 1 custom task) +Custom task: + +* `custom_process` + +### Operators used + +* `PythonOperator` +* plus COSIDAG internal operators + +### XCom usage + +✅ **Yes** (pulls `detected_folder`) + +--- + +## `cosidag_helloworld` + +### File + +`cosidag_helloworld.py` + +### Type + +**COSIDAG (minimal tutorial)** + +### Workflow purpose + +Minimal “hello world” COSIDAG showing: + +* how to define `build_custom(dag)` +* how to chain a single custom task +* how to run even with `monitoring_folders=None` (no folder detection) + +### Inputs + +None (demo-style). In this file `monitoring_folders=None` and `auto_retrig=False`. + +### Outputs + +Logs only. + +### Number of tasks + +**~2–3 total** + +* COSIDAG “detected folder setter” task may exist when monitoring is disabled (implementation detail) +* custom task: `hello_world` + +### Operators used + +* `PythonOperator` +* `BashOperator` +* plus COSIDAG internal operators + +### XCom usage + +✅ **Yes** (used to keep the COSIDAG contract consistent, even in no-monitoring mode) + +--- + +## `cosidag_tutorial_a_svd` + +### File + +`cosidag_a.py` + +### Type + +**COSIDAG (tutorial A)** + +### Workflow purpose + +Tutorial A: + +* build a binary text matrix +* factorize via SVD +* save factor outputs and diagnostic plots + +### Inputs + +* demo parameters (TEXT, SIZE, FONT_SIZE, RANK) +* output base directory: `/home/gamma/workspace/data/tutorials/a_b_factor` + +### Outputs + +In `BASE_DIR`: + +* `factors.pkl` +* `factor_L.png` +* `factor_R.png` + (and any other artifacts generated by the tutorial) + +### Number of tasks + +**~3–4 total** + +* COSIDAG “set detected folder” step (since this tutorial doesn’t rely on monitoring) +* custom task: + + * `a_factorize_text_matrix` + +### Operators used + +* `PythonOperator` (setup / set folder) +* 
`ExternalPythonOperator` (SVD + plots in external env) + +### XCom usage + +✅ **Yes** +Uses XCom to propagate base folder / run folder into the external step. + +--- + +## `cosidag_tutorial_b_reconstruct` + +### File + +`cosidag_b.py` + +### Type + +**COSIDAG (tutorial B)** + +### Workflow purpose + +Tutorial B: + +* load SVD factors generated in tutorial A +* reconstruct the matrix (float + binarized) +* save reconstruction plots + +### Inputs + +From `BASE_DIR`: + +* `factors.pkl` + Parameters: +* `bin_thr` threshold + +### Outputs + +In `BASE_DIR`: + +* `reconstruction_float.png` +* `reconstruction_binary.png` + +### Number of tasks + +**~3–4 total** + +* COSIDAG “set detected folder” step (no monitoring) +* custom task: + + * `b_reconstruct_and_plot` + +### Operators used + +* `PythonOperator` +* `ExternalPythonOperator` + +### XCom usage + +✅ **Yes** +Folder/path handoff via XCom. + +--- + +# Test / utility DAGs + +## `dag_parallel_test_1` + +### File + +`dag_parallel_test_1.py` + +### Type + +**Standard DAG (test)** + +### Workflow purpose + +Simple parallelism test: two independent sleep tasks. + +### Inputs / Outputs + +None (logs only). + +### Number of tasks + +**2** + +* `sleep_a` +* `sleep_b` + +### Operators used + +* `BashOperator` + +### XCom usage + +❌ No + +--- + +## `dag_parallel_test_2` + +### File + +`dag_parallel_test_2.py` + +### Type + +**Standard DAG (test)** + +### Workflow purpose + +Second parallelism test DAG. + +### Inputs / Outputs + +None (logs only). + +### Number of tasks + +**2** + +* `sleep_c` +* `sleep_d` + +### Operators used + +* `BashOperator` + +### XCom usage + +❌ No + +--- + +## `dag_with_email_alert` + +### File + +`fail_task.py` + +### Type + +**Standard DAG (test / failure path)** + +### Workflow purpose + +Intentional failure DAG used to test: + +* failure handling +* alerting / callbacks (depending on Airflow configuration) + +### Inputs / Outputs + +None (it fails on purpose). 
+ +### Number of tasks + +**1** + +* `failing_task` + +### Operators used + +* `PythonOperator` + +### XCom usage + +❌ No + +--- + +# Summary table + +| DAG ID / Name | Type | Purpose (short) | Tasks (approx) | Operators (main) | XCom | +| -------------------------------- | ------- | --------------------------------------- | -------------- | --------------------------------------- | ---- | +| `init_pipelines` | DAG | stage/init run + background cut + links | 6 | PythonOperator, ExternalPythonOperator | Yes | +| `cosidag_tsmap` | COSIDAG | TS map products | ~8 | ExternalPythonOperator (+ COSIDAG core) | Yes | +| `cosidag_lcurve` | COSIDAG | light curve products | ~7 | ExternalPythonOperator (+ COSIDAG core) | Yes | +| `cosidag_example` | COSIDAG | example: detected_folder + file search | ~5–6 | PythonOperator (+ COSIDAG core) | Yes | +| `cosidag_helloworld` | COSIDAG | minimal tutorial | ~2–3 | PythonOperator, BashOperator | Yes | +| `cosidag_tutorial_a_svd` | COSIDAG | SVD factorization tutorial | ~3–4 | PythonOperator, ExternalPythonOperator | Yes | +| `cosidag_tutorial_b_reconstruct` | COSIDAG | reconstruction tutorial | ~3–4 | PythonOperator, ExternalPythonOperator | Yes | +| `dag_parallel_test_1` | DAG | parallelism test | 2 | BashOperator | No | +| `dag_parallel_test_2` | DAG | parallelism test | 2 | BashOperator | No | +| `dag_with_email_alert` | DAG | intentional failure / alert test | 1 | PythonOperator | No | diff --git a/dags/cosidag_a.py b/dags/cosidag_a.py new file mode 100644 index 0000000..fda1c39 --- /dev/null +++ b/dags/cosidag_a.py @@ -0,0 +1,194 @@ +# cosidag_tutorial_a.py +from datetime import datetime +import sys + +sys.path.append("/home/gamma/airflow/modules") + +from cosidag import COSIDAG +from airflow.operators.python import PythonOperator, ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +# Defaults for the demo +TEXT = "DAGs\n ARE\nCOOL!" 
+SIZE = [48, 48] +FONT_SIZE = 6 +RANK = 12 +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" + + +def build_custom(dag): + """ + Tutorial A: + build a binary text matrix, factorize via SVD, save A/B and plots. + """ + + # ------------------------------------------------- + # 0) Declare the result folder for COSIDAG + # ------------------------------------------------- + def _set_detected_folder(ti): + ti.xcom_push(key="detected_folder", value=BASE_DIR) + return BASE_DIR + + set_detected_folder = PythonOperator( + task_id="set_detected_folder", + python_callable=_set_detected_folder, + dag=dag, + ) + + # ------------------------------------------------- + # 1) Factorize text matrix (external env) + # ------------------------------------------------- + def _a_make_factors(base_dir: str, text: str, size: list, font_size: int, rank: int): + """Run entirely in the external 'cosipy' interpreter. + Robustly measure multiline text size across Pillow versions (no draw.textsize). + """ + from pathlib import Path + import pickle + import numpy as np + import matplotlib + matplotlib.use("Agg") # safe non-interactive backend + import matplotlib.pyplot as plt + from PIL import Image, ImageDraw, ImageFont + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + pkl_path = base / "factors.pkl" + img_L = base / "factor_L.png" + img_R = base / "factor_R.png" + + W, H = int(size[0]), int(size[1]) + + # -- Load a mono font if available, otherwise default fallback + try: + font = ImageFont.truetype("DejaVuSansMono.ttf", font_size) + except Exception: + font = ImageFont.load_default() + + # -- Helper: robust multiline text bounding box across Pillow versions + def measure_multiline(draw: ImageDraw.ImageDraw, txt: str, font: ImageFont.ImageFont): + """Return (w, h) for multiline text. 
Tries modern APIs first, falls back gracefully.""" + if hasattr(draw, "multiline_textbbox"): + left, top, right, bottom = draw.multiline_textbbox((0, 0), txt, font=font, align="center") + return (right - left, bottom - top) + if hasattr(draw, "textbbox"): + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + if line == "": + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + widths.append(0) + heights.append(lh) + else: + l, t, r, b = draw.textbbox((0, 0), line, font=font) + widths.append(r - l) + heights.append(b - t) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + # Fallback + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + try: + w_line = draw.textlength(line, font=font) + except Exception: + w_line = max(1, int(len(line) * font.size * 0.6)) + widths.append(int(w_line)) + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + heights.append(lh) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + + # -- 1) Render text -> binary matrix (0 white, 1 black) + img = Image.new("L", (W, H), color=255) + draw = ImageDraw.Draw(img) + + w, h = measure_multiline(draw, text, font) + x = (W - w) // 2 + y = (H - h) // 2 + + if hasattr(draw, "multiline_text"): + draw.multiline_text((x, y), text, fill=0, font=font, align="center") + else: + lines = text.splitlines() or [text] + cur_y = y + for line in lines: + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + draw.text((x, cur_y), line, fill=0, font=font) + cur_y += lh + + arr = np.array(img) + X = (arr < 128).astype(float) # binary 0/1 as float + + # -- 2) SVD factorization: X ≈ (U_k sqrt(S)) (sqrt(S) V_k^T) + U, s, Vt = np.linalg.svd(X, full_matrices=False) + k = max(1, min(int(rank), len(s))) + Uk = U[:, :k] + Sk = np.diag(s[:k]) + Vk = Vt[:k, :] + 
Ssqrt = np.sqrt(Sk) + L = Uk @ Ssqrt + R = Ssqrt @ Vk + + # -- 3) Persist factors + with open(pkl_path, "wb") as f: + pickle.dump( + { + "L": L.astype("float32"), + "R": R.astype("float32"), + "meta": {"rank": int(k), "size": [W, H], "text": text}, + }, + f, + ) + + # -- 4) Visualize L and R (not binary) + def _plot_matrix(M, out_path, title): + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title(title) + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(out_path) + plt.close() + + _plot_matrix(L, img_L, f"L factor ({W}×{k})") + _plot_matrix(R, img_R, f"R factor ({k}×{H})") + + a_factorize = ExternalPythonOperator( + task_id="a_factorize_text_matrix", + python=EXTERNAL_PYTHON, + python_callable=_a_make_factors, # IDENTICA al tutorial originale + op_kwargs={ + "base_dir": BASE_DIR, + "text": TEXT, + "size": SIZE, + "font_size": FONT_SIZE, + "rank": RANK, + }, + dag=dag, + ) + + set_detected_folder >> a_factorize + + +with COSIDAG( + dag_id="cosidag_tutorial_a_svd", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=None, + auto_retrig=False, + build_custom=build_custom, + tags=["cosidag", "tutorial", "external-python"], +) as dag: + pass diff --git a/dags/cosidag_b.py b/dags/cosidag_b.py new file mode 100644 index 0000000..c141172 --- /dev/null +++ b/dags/cosidag_b.py @@ -0,0 +1,104 @@ +# cosidag_tutorial_b.py +from datetime import datetime +import sys + +sys.path.append("/home/gamma/airflow/modules") + +from cosidag import COSIDAG +from airflow.operators.python import PythonOperator, ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" +PKL_PATH = f"{BASE_DIR}/factors.pkl" +BIN_THR = 0.5 + + +def build_custom(dag): + """ + Tutorial B: + reconstruct matrix from A/B factors and produce plots. 
+ """ + + # ------------------------------------------------- + # 0) Declare the result folder (same as A) + # ------------------------------------------------- + def _set_detected_folder(ti): + ti.xcom_push(key="detected_folder", value=BASE_DIR) + return BASE_DIR + + set_detected_folder = PythonOperator( + task_id="set_detected_folder", + python_callable=_set_detected_folder, + dag=dag, + ) + + # ------------------------------------------------- + # 1) Reconstruction + plots (external env) + # ------------------------------------------------- + def _b_reconstruct_and_plot(base_dir: str, pkl_path: str, bin_thr: float): + """Run in external interpreter. Load L,R -> M=L@R; save float & binarized reconstructions.""" + from pathlib import Path + import pickle + import numpy as np + import matplotlib.pyplot as plt + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + img_rec_float = base / "reconstruction_float.png" + img_rec_bin = base / "reconstruction_binary.png" + + with open(pkl_path, "rb") as f: + payload = pickle.load(f) # NOTE(review): pickle is only safe on trusted files; presumably this is the factors.pkl written by tutorial A + + L = np.asarray(payload["L"], dtype=float) # left factor, shape (rows×k) + R = np.asarray(payload["R"], dtype=float) # right factor, shape (k×cols) + + # 1) Reconstruct + M = L @ R + + # 2) Save float heatmap + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title("Reconstruction (float)") + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(img_rec_float) + plt.close() + + # 3) Save binarized heatmap (values >= bin_thr become 1, everything else 0) + M_bin = (M >= bin_thr).astype(int) + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M_bin, cmap="gray_r", interpolation="nearest") + plt.title(f"Reconstruction (binary, thr={bin_thr})") + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(img_rec_bin) + plt.close() + + b_reconstruct = ExternalPythonOperator( + task_id="b_reconstruct_and_plot", + python=EXTERNAL_PYTHON, + python_callable=_b_reconstruct_and_plot, # identical to the original tutorial + op_kwargs={ + "base_dir": 
BASE_DIR, + "pkl_path": PKL_PATH, + "bin_thr": BIN_THR, + }, + dag=dag, + ) + + set_detected_folder >> b_reconstruct + + +with COSIDAG( + dag_id="cosidag_tutorial_b_reconstruct", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=None, + auto_retrig=False, + build_custom=build_custom, + tags=["cosidag", "tutorial", "reconstruction", "external-python"], +) as dag: + pass diff --git a/dags/cosidag_example.py b/dags/cosidag_example.py new file mode 100644 index 0000000..07298de --- /dev/null +++ b/dags/cosidag_example.py @@ -0,0 +1,41 @@ +from datetime import datetime +# make the cosiflow modules directory importable +import sys +sys.path.append("/home/gamma/airflow/modules") +from cosidag import COSIDAG +from airflow.operators.python import PythonOperator +# +def build_custom(dag): + # Example custom task consuming the detected folder via XCom + def _process_folder(folder_path: str): + # Do your science here + print(f"Processing folder: {folder_path}") + # look for a file whose name matches the regex inside the detected folder + file_path = dag.find_file_by_pattern(r".*\.fits.*", folder_path) + print(f"Found file: {file_path}") +# + PythonOperator( + task_id="custom_process", + python_callable=lambda ti, **_: _process_folder( + ti.xcom_pull(task_ids="check_new_file", key="detected_folder") + ), + dag=dag, + ) +# +with COSIDAG( + dag_id="cosidag_example", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=["/home/gamma/workspace/data/tsmap"], + level=3, + date_queries=f"=={datetime.now().strftime('%Y%m%d')}", # NOTE: datetime.now() is evaluated each time this file is parsed, not when a run starts + build_custom=build_custom, + idle_seconds=5, + min_files=1, + ready_marker=None, + only_basename="products", + tags=["cosidag", "example"], + #ready_marker="_SUCCESS", +) as dag: + pass diff --git a/dags/cosidag_helloworld.py b/dags/cosidag_helloworld.py new file mode 100644 index 0000000..89d4645 --- /dev/null +++ b/dags/cosidag_helloworld.py @@ -0,0 +1,53 @@ +from datetime import datetime +from pydoc import describe # NOTE(review): `describe` is never used in this file; looks like an accidental auto-import
+import sys + +# Add cosiflow modules path +sys.path.append("/home/gamma/airflow/modules") + +from cosidag import COSIDAG +from airflow.operators.bash import BashOperator +from airflow.operators.python import PythonOperator + + +def build_custom(dag): + """ + Minimal COSIDAG example. + Shows how to attach a single custom task to the COSIDAG lifecycle. + """ + + def _set_detected_folder(ti): + ti.xcom_push( + key="detected_folder", + value="/home/gamma/workspace/data/tutorials" + ) + return "/home/gamma/workspace/data/tutorials" + + set_detected_folder = PythonOperator( + task_id="set_detected_folder", + # Set the detected folder for the DAG, it is used by the COSIDAG to the folder for the show_results task. + # This task is not part of the COSIDAG workflow, it is used to set the detected folder for the DAG. + python_callable=_set_detected_folder, + dag=dag, + ) + + hello_world = BashOperator( + task_id="hello_world", + bash_command="echo 'Hello from COSIDAG' > /home/gamma/workspace/data/tutorials/hello_world.txt", + dag=dag, + ) + + set_detected_folder >> hello_world + +with COSIDAG( + dag_id="cosidag_helloworld", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=None, # dummy folder for demo + auto_retrig=False, + level=0, + build_custom=build_custom, + tags=["cosidag", "example", "helloworld", "tutorial"], +) as dag: + pass diff --git a/dags/cosipipe_cosipy.py b/dags/cosipipe_cosipy.py deleted file mode 100644 index db87c80..0000000 --- a/dags/cosipipe_cosipy.py +++ /dev/null @@ -1,148 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator -from airflow.operators.bash_operator import BashOperator -import os -import time -import datetime -import logging -from logging.handlers import RotatingFileHandler -from inotify_simple import INotify, flags -from airflow.exceptions import AirflowSkipException -from airflow.operators.dagrun_operator import TriggerDagRunOperator - -# Import necessary 
Airflow classes and standard libraries - -# Define a data pipeline class for monitoring, ingesting, and storing DL0 files -class DataPipeline: - def __init__(self): - # Define directory paths for input, processed data (heasarc), and logs - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - # Set up inotify to watch the input directory for file-close-write events - self.inotify = INotify() - self.watch_flags = flags.CLOSE_WRITE - self.inotify.add_watch(f'{self.base_dir}/input', self.watch_flags) - - # Configure logger with both file rotation and console output - self.logger = logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # File handler for logging to a file - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Console handler for logging to the console - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Adding handlers to the logger - # Avoid duplicate logger handlers - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - # Monitor input directory for new files and return the oldest file when available - def check_new_file_sensor(self, **kwargs): - ti = kwargs['ti'] - self.logger.info("Daemon process started for continuous file monitoring...") - - # Start infinite polling loop to check for input files - while True: - input_directory = os.path.join(self.base_dir, 'input') - input_files = os.listdir(input_directory) - - # Check if there are 
any files - if input_files: - # Find and return the path to the oldest file in the input directory - oldest_file = min([f"{pipeline.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if os.path.exists(oldest_file): - # Log and push to XCom - self.logger.info(f"New file detected: {oldest_file}") - # Push file path to XCom for downstream tasks - ti.xcom_push(key='new_file_path', value=oldest_file) - # Allow subsequent tasks to run - return True - - # Sleep between checks to reduce CPU usage - time.sleep(5) - - # Move detected input file into a timestamped subdirectory inside heasarc - # Store and push the new path for downstream tasks - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - # Retrieve the input file path from XCom - input_files = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file_sensor_task') - if input_files: - # Check that the file exists and move it into a new timestamped subfolder - if not os.path.exists(input_files): - raise FileNotFoundError(f"Input file {input_files} does not exist.") - self.logger.info(f"Processing DL0 file: {input_files}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - timestamp_utc = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d_%H-%M-%S') - new_dir = f'{self.heasarc_dir}/dl0/{timestamp_utc}' - os.makedirs(new_dir, exist_ok=True) - stored_file_path = f"{new_dir}/{os.path.basename(input_files)}" - os.rename(input_files, stored_file_path) - self.logger.info(f"Stored DL0 file: {stored_file_path}") - # Push the new file path to XCom for further use - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - # Handle missing file or other unexpected exceptions gracefully - self.logger.error(f"Error: {e}. 
Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - -pipeline = DataPipeline() - -# Define the Airflow DAG to orchestrate DL0 file monitoring, ingestion, and plotting -with DAG('cosipy_test_v0', default_args={'owner': 'airflow'}, schedule=None, - max_active_tasks=5, # Maximum number of tasks that can be executed simultaneously per DAG - max_active_runs=4 # Maximum number of DAG instances that can be executed simultaneously - ) as dag: - - # Task to detect the arrival of new files in the input directory - wait_for_new_file_sensor_task = PythonOperator( - task_id='wait_for_new_file_sensor_task', - python_callable=pipeline.check_new_file_sensor, - dag=dag - ) - - # Task to move and organize the newly detected file - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - ) - - # Task to generate plots using an external script in the cosipy environment - trigger_next_run = TriggerDagRunOperator( - task_id="trigger_next_run", - trigger_dag_id="cosipy_test_v0", # Stesso DAG - dag=dag, - ) - - # Task to trigger the same DAG again for continuous processing - generate_plots = BashOperator( - task_id='generate_plots', - bash_command=""" - source activate cosipy && - python /shared_dir/pipeline/generate_plot.py "{{ task_instance.xcom_pull(task_ids='ingest_and_store_dl0_sensor', key='stored_dl0_file') }}" - """, - dag=dag, - ) - - wait_for_new_file_sensor_task >> ingest_and_store_dl0_task_sensor >> generate_plots >> trigger_next_run diff --git a/dags/cosipipe_cosipy_external_python.py b/dags/cosipipe_cosipy_external_python.py deleted file mode 100644 index b103318..0000000 --- a/dags/cosipipe_cosipy_external_python.py +++ /dev/null @@ -1,185 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator, ExternalPythonOperator -from 
airflow.operators.bash_operator import BashOperator -import os -import time -import datetime -import logging -from logging.handlers import RotatingFileHandler -from airflow.exceptions import AirflowSkipException -from airflow.operators.dagrun_operator import TriggerDagRunOperator -from airflow.decorators import task, dag - -# Import required modules and operators from Airflow and standard Python libraries - -# Define a class to encapsulate the data ingestion and logging logic -class DataPipeline: - def __init__(self): - # Set base directories for input, output, and logs - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - # Configure logger for both file and console output - self.logger = logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # Add rotating file handler to limit log file size - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Add console stream handler for real-time feedback - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Avoid adding handlers multiple times - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - # Continuously monitor the input directory for new files - def check_new_file_sensor(self, **kwargs): - ti = kwargs['ti'] - self.logger.info("Daemon process started for continuous file monitoring...") - - while True: - # List files in the input directory - input_directory = os.path.join(self.base_dir, 'input') - 
input_files = os.listdir(input_directory) - - # Select the oldest file available - if input_files: - oldest_file = min([f"{pipeline.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if os.path.exists(oldest_file): - # Push file path to XCom for downstream tasks - self.logger.info(f"New file detected: {oldest_file}") - ti.xcom_push(key='new_file_path', value=oldest_file) - # Wait a short time before the next polling iteration - return True - - # Sleep before next check to avoid high CPU usage - time.sleep(5) - - # Move detected input file to a timestamped directory and store the path - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - # Retrieve file path from XCom - input_files = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file_sensor_task') - if input_files: - # Check if the file exists before proceeding - if not os.path.exists(input_files): - raise FileNotFoundError(f"Input file {input_files} does not exist.") - self.logger.info(f"Processing DL0 file: {input_files}") - # Create directory structure for storing the file - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - timestamp_utc = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d_%H-%M-%S') - new_dir = f'{self.heasarc_dir}/dl0/{timestamp_utc}' - os.makedirs(new_dir, exist_ok=True) - # Rename (move) the file to the new directory - stored_file_path = f"{new_dir}/{os.path.basename(input_files)}" - os.rename(input_files, stored_file_path) - # Push the new file path to XCom - self.logger.info(f"Stored DL0 file: {stored_file_path}") - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. 
Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - -pipeline = DataPipeline() - -# Generate plots and summary data from the DL0 file using the cosipy library -def generate_plots_task(file_path): - - import sys, os - from cosipy.util import fetch_wasabi_file - from cosipy import BinnedData - from pathlib import Path - - # Define the directory and create the input YAML configuration file - print("test") - print(file_path) - dir_name = os.path.dirname(file_path) - - content_to_write = f"""#----------# - # Data I/O: - - # data files available on the COSI Sharepoint: https://drive.google.com/drive/folders/1UdLfuLp9Fyk4dNussn1wt7WEOsTWrlQ6 - data_file: {file_path} # full path - ori_file: "NA" # full path - unbinned_output: 'hdf5' # 'fits' or 'hdf5' - time_bins: 60 # time bin size in seconds. Takes int, float, or list of bin edges. - energy_bins: [100., 200., 500., 1000., 2000., 5000.] # Takes list. Needs to match response. - phi_pix_size: 6 # binning of Compton scattering anlge [deg] - nside: 8 # healpix binning of psi chi local - scheme: 'ring' # healpix binning of psi chi local - tmin: 1835478000.0 # Min time cut in seconds. - tmax: 1835485200.0 # Max time cut in seconds. 
- #----------# - """ - - dir_name_path = Path(dir_name) - - # Open the file in write mode and write the content - with open(dir_name_path / "inputs.yaml", "w") as file: - file.write(content_to_write) - - # Run analysis steps: read .tra file, bin data, create spectrum and light curve - analysis = BinnedData(dir_name_path / "inputs.yaml") - analysis.read_tra(output_name=dir_name_path / "unbinned_data") - analysis.get_binned_data() - analysis.get_raw_spectrum(output_name=file_path.replace(".crab2hr.extracted.tra.gz", "")) - analysis.get_raw_lightcurve(output_name=file_path.replace(".crab2hr.extracted.tra.gz", "")) - - -# Define the DAG and the task pipeline for DL0 processing and plotting -with DAG('cosipy_external_python_v2', default_args={'owner': 'airflow'}, schedule=None, - #start_date=datetime.now(), - max_active_tasks=5, # Maximum number of tasks that can be executed simultaneously per DAG - max_active_runs=4 # Maximum number of DAG instances that can be executed simultaneously - ) as dag: - - # Wait for new file to appear in input directory - wait_for_new_file_sensor_task = PythonOperator( - task_id='wait_for_new_file_sensor_task', - python_callable=pipeline.check_new_file_sensor, - dag=dag - ) - - # Move the file and store it in the appropriate location - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - ) - - # Trigger the same DAG again to run continuously - trigger_next_run = TriggerDagRunOperator( - task_id="trigger_next_run", - trigger_dag_id="cosipy_external_python_v2", - dag=dag, - ) - - # Run the plot generation script in an external Python environment - generate_plots = ExternalPythonOperator( - task_id='generate_plots', - python_callable=generate_plots_task, - python="/home/gamma/.conda/envs/cosipy/bin/python", - op_args=["{{ task_instance.xcom_pull(task_ids='ingest_and_store_dl0_sensor', key='stored_dl0_file') }}"], - dag=dag, - ) - - 
wait_for_new_file_sensor_task >> ingest_and_store_dl0_task_sensor >> generate_plots >> trigger_next_run \ No newline at end of file diff --git a/dags/cosipipev0.py b/dags/cosipipev0.py deleted file mode 100644 index af82ab3..0000000 --- a/dags/cosipipev0.py +++ /dev/null @@ -1,236 +0,0 @@ -from airflow import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago -import os -import time -import csv -import random -import logging -from datetime import datetime, timedelta -from airflow.exceptions import AirflowSkipException - -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - current_time = datetime.now().strftime("%Y%m%d_%H%M%S_%f") - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}_{current_time}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - dl0_file = kwargs['ti'].xcom_pull(key='stored_dl0_file', task_ids='ingest_and_store_dl0') - if dl0_file: - self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - - def generate_dl1b(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1a') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1a/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1c(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1b') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1b/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl2(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1c') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1c/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl2', 'dl2') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl2') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl2/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def fast_transient_stage_two(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/fast_transient_stage_1') - if input_files: - latest_file = max([f"{self.heasarc_dir}/fast_transient_stage_1/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_three(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/fast_transient_stage_2') - if input_files: - latest_file = max([f"{self.heasarc_dir}/fast_transient_stage_2/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self): - input_files = os.listdir(f'{self.base_dir}/input') - self.logger.info(f"Checking for new files. 
Current files: {input_files}") - return bool(input_files) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline', default_args={'owner': 'airflow'}, schedule_interval=None, - start_date=datetime.now(), - concurrency=5, # Numero massimo di task eseguibili contemporaneamente per DAG - max_active_runs=4 # Numero massimo di istanze del DAG che possono essere eseguite contemporaneamente - ) as dag: - - wait_for_new_file = PythonSensor( - task_id='wait_for_new_file', - python_callable=pipeline.check_new_file, - poke_interval=1, - timeout=600 - ) - - ingest_and_store_dl0_task = PythonOperator( - task_id='ingest_and_store_dl0', - python_callable=pipeline.ingest_and_store_dl0, - provide_context=True - ) - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - notify_completion_task = PythonOperator( - task_id='notify_completion', - python_callable=pipeline.notify_completion - ) - - wait_for_new_file >> ingest_and_store_dl0_task >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> fast_transient_stage_one_task >> 
fast_transient_stage_two_task >> fast_transient_stage_three_task >> notify_completion_task - diff --git a/dags/cosipipev1.py b/dags/cosipipev1.py deleted file mode 100644 index 86a098f..0000000 --- a/dags/cosipipev1.py +++ /dev/null @@ -1,235 +0,0 @@ -from airflow import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago -import os -import time -import csv -import random -import logging -from datetime import datetime, timedelta -from airflow.exceptions import AirflowSkipException - -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - current_time = datetime.now().strftime("%Y%m%d_%H%M%S_%f") - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}_{current_time}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - dl0_file = kwargs['ti'].xcom_pull(key='stored_dl0_file', task_ids='ingest_and_store_dl0') - if dl0_file: - self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - - def generate_dl1b(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1a') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1a/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1c(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1b') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1b/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl2(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1c') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1c/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl2', 'dl2') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl2') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl2/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def fast_transient_stage_two(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/fast_transient_stage_1') - if input_files: - latest_file = max([f"{self.heasarc_dir}/fast_transient_stage_1/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_three(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/fast_transient_stage_2') - if input_files: - latest_file = max([f"{self.heasarc_dir}/fast_transient_stage_2/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self): - input_files = os.listdir(f'{self.base_dir}/input') - self.logger.info(f"Checking for new files. 
Current files: {input_files}") - return bool(input_files) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline_v1', default_args={'owner': 'airflow'}, schedule_interval=None, - start_date=datetime.now(), - concurrency=5, # Numero massimo di task eseguibili contemporaneamente per DAG - max_active_runs=4 # Numero massimo di istanze del DAG che possono essere eseguite contemporaneamente - ) as dag: - - wait_for_new_file = PythonSensor( - task_id='wait_for_new_file', - python_callable=pipeline.check_new_file, - poke_interval=1, - timeout=600 - ) - - ingest_and_store_dl0_task = PythonOperator( - task_id='ingest_and_store_dl0', - python_callable=pipeline.ingest_and_store_dl0, - provide_context=True - ) - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - notify_completion_task = PythonOperator( - task_id='notify_completion', - python_callable=pipeline.notify_completion - ) - - wait_for_new_file >> ingest_and_store_dl0_task >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> fast_transient_stage_one_task >> 
fast_transient_stage_two_task >> fast_transient_stage_three_task >> notify_completion_task diff --git a/dags/cosipipev2.py b/dags/cosipipev2.py deleted file mode 100644 index 3caef7d..0000000 --- a/dags/cosipipev2.py +++ /dev/null @@ -1,362 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago, timezone -import os -import time -import csv -import random -import logging -from logging.handlers import RotatingFileHandler -from inotify_simple import INotify, flags -from airflow.exceptions import AirflowSkipException -from datetime import datetime, timedelta - -#AIRFLOW -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - self.inotify = INotify() - self.watch_flags = flags.CLOSE_WRITE - self.inotify.add_watch(f'{self.base_dir}/input', self.watch_flags) - - # Logger setup for both Celery and the pipeline - self.logger = logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # File handler for logging to a file - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Console handler for logging to the console - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Adding handlers to the logger - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - self.logger = logging.getLogger(__name__) - - def 
ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - new_file_path = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file') - if new_file_path: - if not os.path.exists(new_file_path): - raise FileNotFoundError(f"Input file {new_file_path} does not exist.") - self.logger.info(f"Oldest DL0 file: {new_file_path}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - stored_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(new_file_path)}" - os.rename(new_file_path, stored_file_path) - self.logger.info(f"Stored DL0 file: {stored_file_path}") - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. 
Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - try: - ti = kwargs['ti'] - dl0_file = ti.xcom_pull(key='stored_dl0_file') - if dl0_file: - if not os.path.exists(dl0_file): - raise FileNotFoundError(f"DL0 file {dl0_file} does not exist. It may have been processed by another instance.") - filename = self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - ti.xcom_push(key='stored_dl1a_file', value=filename) - else: - self.logger.warning("No DL0 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL0 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1b(self, **kwargs): - try: - ti = kwargs['ti'] - dl1a_file = ti.xcom_pull(key='stored_dl1a_file', task_ids='generate_dl1a') - if dl1a_file: - if not os.path.exists(dl1a_file): - raise FileNotFoundError(f"DL1a file {dl1a_file} does not exist.") - filename = self.generate_placeholder_file(dl1a_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - ti.xcom_push(key='stored_dl1b_file', value=filename) - else: - self.logger.warning("No DL1a file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1a file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1c(self, **kwargs): - try: - ti = kwargs['ti'] - dl1b_file = ti.xcom_pull(key='stored_dl1b_file', task_ids='generate_dl1b') - if dl1b_file: - if not os.path.exists(dl1b_file): - raise FileNotFoundError(f"DL1b file {dl1b_file} does not exist.") - filename = self.generate_placeholder_file(dl1b_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - ti.xcom_push(key='stored_dl1c_file', value=filename) - else: - self.logger.warning("No DL1b file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1b file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl2(self, **kwargs): - try: - ti = kwargs['ti'] - dl1c_file = ti.xcom_pull(key='stored_dl1c_file', task_ids='generate_dl1c') - if dl1c_file: - if not os.path.exists(dl1c_file): - raise FileNotFoundError(f"DL1c file {dl1c_file} does not exist.") - filename = self.generate_placeholder_file(dl1c_file, f'{self.heasarc_dir}/dl2', 'dl2') - ti.xcom_push(key='stored_dl2_file', value=filename) - else: - self.logger.warning("No DL1c file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1c file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - ti.xcom_push(key='stored_stage1_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def fast_transient_stage_two(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - ti.xcom_push(key='stored_stage2_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - - def fast_transient_stage_three(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = ti.xcom_pull(key='stored_stage2_file', task_ids='fast_transient_stage_two') - if not os.path.exists(input_files): - raise FileNotFoundError(f"stage 2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(input_files, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): # Wait for 1 second for an event - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - file_path = f"{self.base_dir}/input/{event.name}" - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=file_path) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=event.name) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file_sensor(self): - input_files = os.listdir(f'{self.base_dir}/input') - self.logger.info(f"Checking for new files. 
Current files: {input_files}") - return bool(input_files) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline_v2', default_args={'owner': 'airflow'}, schedule=None, - start_date=datetime.now(), - max_active_tasks=5, # Numero massimo di task eseguibili contemporaneamente per DAG - max_active_runs=4 # Numero massimo di istanze del DAG che possono essere eseguite contemporaneamente - ) as dag: - - #wait_for_new_file = PythonOperator( - # task_id='wait_for_new_file', - # python_callable=pipeline.check_new_file, - # provide_context=True - #) - - # ingest_and_store_dl0_task = PythonOperator( - # task_id='ingest_and_store_dl0', - # python_callable=pipeline.ingest_and_store_dl0, - # provide_context=True - # ) - - wait_for_new_file_sensor = PythonSensor( - task_id='wait_for_new_file_sensor', - python_callable=pipeline.check_new_file_sensor, - poke_interval=1, - timeout=600 - ) - - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - provide_context=True - ) - - - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - 
python_callable=pipeline.fast_transient_stage_three - ) - - wait_for_new_file_sensor >> ingest_and_store_dl0_task_sensor >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> [fast_transient_stage_one_task, fast_transient_stage_two_task] - fast_transient_stage_two_task >> fast_transient_stage_three_task diff --git a/dags/cosipipev3.py b/dags/cosipipev3.py deleted file mode 100644 index 2a51cd6..0000000 --- a/dags/cosipipev3.py +++ /dev/null @@ -1,394 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago, timezone -import os -import time -import csv -import random -import logging -from logging.handlers import RotatingFileHandler -from inotify_simple import INotify, flags -from airflow.exceptions import AirflowSkipException -from datetime import datetime, timedelta -from airflow.operators.dagrun_operator import TriggerDagRunOperator - -#airflow dags trigger cosi_data_analysis_pipeline_v3 -#airflow dags list-runs -d cosi_data_analysis_pipeline_v3 --state running - - -#AIRFLOW -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - self.inotify = INotify() - self.watch_flags = flags.CLOSE_WRITE - self.inotify.add_watch(f'{self.base_dir}/input', self.watch_flags) - - # Logger setup for both Celery and the pipeline - self.logger = logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # File handler for logging to a file - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Console handler for logging to the console - 
console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Adding handlers to the logger - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - new_file_path = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file') - if new_file_path: - if not os.path.exists(new_file_path): - raise FileNotFoundError(f"Input file {new_file_path} does not exist.") - self.logger.info(f"Oldest DL0 file: {new_file_path}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - stored_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(new_file_path)}" - os.rename(new_file_path, stored_file_path) - self.logger.info(f"Stored DL0 file: {stored_file_path}") - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - try: - ti = kwargs['ti'] - dl0_file = ti.xcom_pull(key='stored_dl0_file') - if dl0_file: - if not os.path.exists(dl0_file): - raise FileNotFoundError(f"DL0 file {dl0_file} does not exist. It may have been processed by another instance.") - filename = self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - ti.xcom_push(key='stored_dl1a_file', value=filename) - else: - self.logger.warning("No DL0 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL0 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1b(self, **kwargs): - try: - ti = kwargs['ti'] - dl1a_file = ti.xcom_pull(key='stored_dl1a_file', task_ids='generate_dl1a') - if dl1a_file: - if not os.path.exists(dl1a_file): - raise FileNotFoundError(f"DL1a file {dl1a_file} does not exist.") - filename = self.generate_placeholder_file(dl1a_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - ti.xcom_push(key='stored_dl1b_file', value=filename) - else: - self.logger.warning("No DL1a file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1a file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1c(self, **kwargs): - try: - ti = kwargs['ti'] - dl1b_file = ti.xcom_pull(key='stored_dl1b_file', task_ids='generate_dl1b') - if dl1b_file: - if not os.path.exists(dl1b_file): - raise FileNotFoundError(f"DL1b file {dl1b_file} does not exist.") - filename = self.generate_placeholder_file(dl1b_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - ti.xcom_push(key='stored_dl1c_file', value=filename) - else: - self.logger.warning("No DL1b file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1b file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl2(self, **kwargs): - try: - ti = kwargs['ti'] - dl1c_file = ti.xcom_pull(key='stored_dl1c_file', task_ids='generate_dl1c') - if dl1c_file: - if not os.path.exists(dl1c_file): - raise FileNotFoundError(f"DL1c file {dl1c_file} does not exist.") - filename = self.generate_placeholder_file(dl1c_file, f'{self.heasarc_dir}/dl2', 'dl2') - ti.xcom_push(key='stored_dl2_file', value=filename) - else: - self.logger.warning("No DL1c file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1c file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def fast_transient_stage_one(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - ti.xcom_push(key='stored_stage1_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_two(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - ti.xcom_push(key='stored_stage2_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - - def fast_transient_stage_three(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = ti.xcom_pull(key='stored_stage2_file', task_ids='fast_transient_stage_two') - if not os.path.exists(input_files): - raise FileNotFoundError(f"stage 2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(input_files, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): # Wait for 1 second for an event - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - file_path = f"{self.base_dir}/input/{event.name}" - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=file_path) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. 
Continuing to monitor...") - return False - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=event.name) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file_sensor(self, **kwargs): - ti = kwargs['ti'] - pipeline.logger.info("Daemon process started for continuous file monitoring...") - - while True: - input_files = os.listdir(f'{pipeline.base_dir}/input') - - # Check if there are any files - if input_files: - # Get the oldest file - oldest_file = min([f"{pipeline.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - - if os.path.exists(oldest_file): - # Log and push to XCom - pipeline.logger.info(f"New file detected: {oldest_file}") - ti.xcom_push(key='new_file_path', value=oldest_file) - - # Allow subsequent tasks to run - return True - - # Sleep before next check to avoid high CPU usage - time.sleep(5) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline_v3', default_args={'owner': 'airflow'}, schedule=None, - start_date=datetime.now(), - max_active_tasks=5, # Maximum number of tasks that can be executed simultaneously per DAG - max_active_runs=4 # Maximum number of DAG instances that can be executed simultaneously - ) as dag: - - #wait_for_new_file = PythonOperator( - # task_id='wait_for_new_file', - # python_callable=pipeline.check_new_file, - # provide_context=True - #) - - # ingest_and_store_dl0_task = PythonOperator( - # task_id='ingest_and_store_dl0', - # python_callable=pipeline.ingest_and_store_dl0, - # provide_context=True - # ) - - 
wait_for_new_file_sensor_task = PythonOperator( - task_id='wait_for_new_file_sensor_task', - python_callable=pipeline.check_new_file_sensor, - provide_context=True, - dag=dag - ) - - - - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - provide_context=True - ) - - - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - # Definisci il task per triggerare il DAG stesso - trigger_next_run = TriggerDagRunOperator( - task_id="trigger_next_run", - trigger_dag_id="cosi_data_analysis_pipeline_v3", # Stesso DAG - dag=dag, - ) - - wait_for_new_file_sensor_task >> ingest_and_store_dl0_task_sensor >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> [fast_transient_stage_one_task, fast_transient_stage_two_task] - fast_transient_stage_two_task >> fast_transient_stage_three_task >> trigger_next_run diff --git a/dags/cosipipev3o.py b/dags/cosipipev3o.py deleted file mode 100644 index 782836e..0000000 --- a/dags/cosipipev3o.py +++ /dev/null @@ -1,382 +0,0 @@ -from airflow import DAG -from 
airflow.operators.python import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago, timezone -import os -import time -import csv -import random -import logging -from logging.handlers import RotatingFileHandler -from inotify_simple import INotify, flags -from airflow.exceptions import AirflowSkipException -from datetime import datetime, timedelta - -#AIRFLOW -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - self.inotify = INotify() - self.watch_flags = flags.CLOSE_WRITE - self.inotify.add_watch(f'{self.base_dir}/input', self.watch_flags) - - # Logger setup for both Celery and the pipeline - self.logger = logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # File handler for logging to a file - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Console handler for logging to the console - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Adding handlers to the logger - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - new_file_path = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file') - if new_file_path: - if not os.path.exists(new_file_path): - raise FileNotFoundError(f"Input file {new_file_path} does not exist.") - 
self.logger.info(f"Oldest DL0 file: {new_file_path}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - stored_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(new_file_path)}" - os.rename(new_file_path, stored_file_path) - self.logger.info(f"Stored DL0 file: {stored_file_path}") - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - try: - ti = kwargs['ti'] - dl0_file = ti.xcom_pull(key='stored_dl0_file') - if dl0_file: - if not os.path.exists(dl0_file): - raise FileNotFoundError(f"DL0 file {dl0_file} does not exist. It may have been processed by another instance.") - filename = self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - ti.xcom_push(key='stored_dl1a_file', value=filename) - else: - self.logger.warning("No DL0 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL0 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1b(self, **kwargs): - try: - ti = kwargs['ti'] - dl1a_file = ti.xcom_pull(key='stored_dl1a_file', task_ids='generate_dl1a') - if dl1a_file: - if not os.path.exists(dl1a_file): - raise FileNotFoundError(f"DL1a file {dl1a_file} does not exist.") - filename = self.generate_placeholder_file(dl1a_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - ti.xcom_push(key='stored_dl1b_file', value=filename) - else: - self.logger.warning("No DL1a file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1a file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1c(self, **kwargs): - try: - ti = kwargs['ti'] - dl1b_file = ti.xcom_pull(key='stored_dl1b_file', task_ids='generate_dl1b') - if dl1b_file: - if not os.path.exists(dl1b_file): - raise FileNotFoundError(f"DL1b file {dl1b_file} does not exist.") - filename = self.generate_placeholder_file(dl1b_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - ti.xcom_push(key='stored_dl1c_file', value=filename) - else: - self.logger.warning("No DL1b file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1b file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl2(self, **kwargs): - try: - ti = kwargs['ti'] - dl1c_file = ti.xcom_pull(key='stored_dl1c_file', task_ids='generate_dl1c') - if dl1c_file: - if not os.path.exists(dl1c_file): - raise FileNotFoundError(f"DL1c file {dl1c_file} does not exist.") - filename = self.generate_placeholder_file(dl1c_file, f'{self.heasarc_dir}/dl2', 'dl2') - ti.xcom_push(key='stored_dl2_file', value=filename) - else: - self.logger.warning("No DL1c file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1c file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - ti.xcom_push(key='stored_stage1_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def fast_transient_stage_two(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - ti.xcom_push(key='stored_stage2_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - - def fast_transient_stage_three(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = ti.xcom_pull(key='stored_stage2_file', task_ids='fast_transient_stage_two') - if not os.path.exists(input_files): - raise FileNotFoundError(f"stage 2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(input_files, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): # Wait for 1 second for an event - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - file_path = f"{self.base_dir}/input/{event.name}" - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=file_path) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=event.name) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. 
Continuing to monitor...") - return False - - def check_new_file_sensor(self, **kwargs): - ti = kwargs['ti'] - pipeline.logger.info("Daemon process started for continuous file monitoring...") - - while True: - input_files = os.listdir(f'{pipeline.base_dir}/input') - - # Check if there are any files - if input_files: - # Get the oldest file - oldest_file = min([f"{pipeline.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - - if os.path.exists(oldest_file): - # Log and push to XCom - pipeline.logger.info(f"New file detected: {oldest_file}") - ti.xcom_push(key='new_file_path', value=oldest_file) - - # Allow subsequent tasks to run - return True - - # Sleep before next check to avoid high CPU usage - time.sleep(5) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline_v3o', default_args={'owner': 'airflow'}, schedule=None, - start_date=datetime.now(), - max_active_tasks=5, # Numero massimo di task eseguibili contemporaneamente per DAG - max_active_runs=4 # Numero massimo di istanze del DAG che possono essere eseguite contemporaneamente - ) as dag: - - #wait_for_new_file = PythonOperator( - # task_id='wait_for_new_file', - # python_callable=pipeline.check_new_file, - # provide_context=True - #) - - # ingest_and_store_dl0_task = PythonOperator( - # task_id='ingest_and_store_dl0', - # python_callable=pipeline.ingest_and_store_dl0, - # provide_context=True - # ) - - wait_for_new_file_sensor_task = PythonOperator( - task_id='wait_for_new_file_sensor_task', - python_callable=pipeline.check_new_file_sensor, - provide_context=True, - dag=dag - ) - - - - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - provide_context=True - ) - - - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = 
PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - wait_for_new_file_sensor_task >> ingest_and_store_dl0_task_sensor >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> [fast_transient_stage_one_task, fast_transient_stage_two_task] - fast_transient_stage_two_task >> fast_transient_stage_three_task diff --git a/dags/dag_parallel_test_1.py b/dags/dag_parallel_test_1.py new file mode 100644 index 0000000..9da55aa --- /dev/null +++ b/dags/dag_parallel_test_1.py @@ -0,0 +1,16 @@ +from airflow import DAG +from airflow.operators.bash import BashOperator +from datetime import datetime + +with DAG( + dag_id="dag_parallel_test_1", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + max_active_runs=2, + concurrency=3, + tags=["test", "parallel"] +) as dag: + + BashOperator(task_id="sleep_a", bash_command="sleep 60") + BashOperator(task_id="sleep_b", bash_command="sleep 60") diff --git a/dags/dag_parallel_test_2.py b/dags/dag_parallel_test_2.py new file mode 100644 index 0000000..0daec63 --- /dev/null +++ b/dags/dag_parallel_test_2.py @@ -0,0 +1,16 @@ +from airflow import DAG +from airflow.operators.bash import BashOperator +from datetime import datetime + +with DAG( + dag_id="dag_parallel_test_2", + 
start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + max_active_runs=2, + concurrency=3, + tags=["test", "parallel"] +) as dag: + + BashOperator(task_id="sleep_c", bash_command="sleep 60") + BashOperator(task_id="sleep_d", bash_command="sleep 60") diff --git a/dags/fail_task.py b/dags/fail_task.py new file mode 100644 index 0000000..4fbf44f --- /dev/null +++ b/dags/fail_task.py @@ -0,0 +1,32 @@ +from airflow import DAG +from airflow.operators.python import PythonOperator +from datetime import datetime + +import sys +import os +airflow_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow") +sys.path.append(os.path.join(airflow_home, "callbacks")) +from on_failure_callback import notify_email + +def failing_task(): + raise ValueError("This task fails.") + +with DAG( + 'dag_with_email_alert', + default_args={ + 'owner': 'airflow', + 'depends_on_past': False, + 'email_on_failure': True, + 'email_on_retry': False, + 'on_failure_callback': notify_email, + 'retries': 0, + }, + schedule_interval=None, + start_date=datetime(2025, 1, 1), + catchup=False, + tags=["test", "failure", "email_alert"] +) as dag: + fail = PythonOperator( + task_id='failing_task', + python_callable=failing_task + ) diff --git a/dags/gendl0.py b/dags/gendl0.py deleted file mode 100644 index 91f7f9b..0000000 --- a/dags/gendl0.py +++ /dev/null @@ -1,54 +0,0 @@ -from airflow import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.utils.dates import days_ago -from datetime import timedelta -import os -import time -import csv -import random - -# Define a class to encapsulate data generation logic -class DataPipeline: - def __init__(self): - # Base directory for storing input data - self.base_dir = '/home/gamma/workspace/data' - - def generate_dl0_file(self): - # Create the input directory if it doesn't exist - input_directory = os.path.join(self.base_dir, 'input') - if not os.path.exists(input_directory): - os.makedirs(input_directory) - # Generate a 
filename using the current timestamp - filename = os.path.join(input_directory, f"dl0_{int(time.time())}.csv") - # Write random data to the CSV file - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - # Write header row - writer.writerow(["parameter1", "parameter2", "parameter3"]) - # Write 100 rows of random float values - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - -# Instantiate the data pipeline -pipeline = DataPipeline() - -# Define default arguments for the Airflow DAG -default_args = { - 'owner': 'airflow', - 'start_date': days_ago(1), - 'retries': 1, - 'retry_delay': timedelta(minutes=1), -} - -# Define a DAG that periodically generates DL0 data -with DAG( - 'generate_dl0_data', - default_args=default_args, - schedule_interval=timedelta(seconds=10), # Runs every 10 seconds - catchup=False, -) as generate_dl0_dag: - # Create a task that calls the generate_dl0_file function - generate_dl0_task = PythonOperator( - task_id='generate_dl0_file', - python_callable=pipeline.generate_dl0_file - ) diff --git a/env/Dockerfile b/env/Dockerfile deleted file mode 100644 index f6dac44..0000000 --- a/env/Dockerfile +++ /dev/null @@ -1,77 +0,0 @@ -FROM oraclelinux:8 AS oracle8 - -# ---------------------------------- Installing dependencies as root ---------------------------------- -RUN dnf install -y wget epel-release git cmake3 gcc-c++ gcc binutils \ -compat-openssl10 libX11-devel libXpm-devel libXft-devel libXext-devel \ -gsl-devel openssl-devel wget bzip2-devel libffi-devel xz-devel sqlite-devel \ -ncurses ncurses-devel make xz libzstd libzstd-devel which rsync \ -nmap-ncat chrony - -RUN dnf install -y oracle-epel-release-el8 -RUN dnf config-manager --enable ol8_codeready_builder -RUN dnf install -y hdf5 hdf5-devel - -# ---------------------------------- Create gamma ---------------------------------- -RUN useradd gamma -USER gamma -WORKDIR /home/gamma -RUN mkdir -p /home/gamma/env -COPY environment.yml 
/home/gamma/env -COPY requirements.txt /home/gamma/env -SHELL ["/bin/bash", "--login", "-c"] - -USER root - -# ARM processors (Mac) -# Definisci la variabile per il file Miniconda -ARG MINICONDA=Miniconda3-latest-Linux-aarch64.sh -# INTEL/AMD processors -ARG MINICONDA=Miniconda3-latest-Linux-x86_64.sh - -# Scarica Miniconda utilizzando la variabile -RUN wget https://repo.anaconda.com/miniconda/$MINICONDA \ - && chmod +x $MINICONDA \ - && ./$MINICONDA -b -p /opt/conda \ - && rm $MINICONDA - -RUN chown -R gamma:gamma /home/gamma - -USER gamma - -COPY entrypoint.sh /home/gamma/entrypoint.sh - -RUN export PATH=$PATH:/opt/conda/bin && conda config --append channels conda-forge && conda config --set channel_priority strict - -RUN export PATH=$PATH:/opt/conda/bin && conda create -n gamma python=3.12 pip psycopg2 jupyter numpy scipy matplotlib pandas plotly scikit-learn tensorboard tensorflow inotify_simple -y - -RUN export PATH=$PATH:/opt/conda/bin && source activate gamma && export AIRFLOW_VERSION=2.10.3 && \ - export PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" && \ - export CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" && \ - pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" - - -RUN export PATH=$PATH:/opt/conda/bin && conda create -n cosipy python=3.10 pip jupyter notebook && source activate cosipy && git clone https://github.com/cositools/cosipy.git && cd cosipy && git checkout develop && pip install -e . 
- -RUN mkdir -p ${HOME}/airflow - -USER root -RUN mkdir /shared_dir -RUN chown -R gamma:gamma /shared_dir -RUN mkdir /data01 -RUN chown -R gamma:gamma /data01 -RUN mkdir /data02 -RUN chown -R gamma:gamma /data02 -RUN chmod +x /home/gamma/entrypoint.sh - -USER gamma -RUN mkdir /home/gamma/workspace -#dir to run pipeline -RUN mkdir /home/gamma/workspace/data -RUN mkdir /home/gamma/workspace/data/input -RUN mkdir /home/gamma/workspace/heasarc -RUN mkdir /home/gamma/workspace/log - - -ENV PATH="/opt/conda/bin:$PATH" - -#ENTRYPOINT ["/home/gamma/entrypoint.sh"] diff --git a/env/Dockerfile.airflow b/env/Dockerfile.airflow new file mode 100644 index 0000000..5c50d33 --- /dev/null +++ b/env/Dockerfile.airflow @@ -0,0 +1,93 @@ +# ============================================================================= +# Dockerfile.airflow — Native Python (venv), OracleLinux 8 base +# Base: OracleLinux 8 | Works on amd64 and arm64 +# ============================================================================= + +FROM oraclelinux:8 AS oracle8 + +# ------------------------------ Base dependencies ------------------------------ +# Install core system packages, Python 3.9, and dev libraries +RUN set -eux; \ + dnf module enable -y python39; \ + dnf install -y \ + python39 python39-devel python39-pip \ + wget curl ca-certificates gnupg2 tar bzip2 xz unzip which rsync \ + git cmake3 gcc-c++ gcc make binutils \ + compat-openssl10 openssl-devel \ + libX11-devel libXpm-devel libXft-devel libXext-devel \ + gsl-devel bzip2-devel libffi-devel xz-devel sqlite-devel \ + ncurses ncurses-devel xz libzstd libzstd-devel \ + nmap-ncat chrony; \ + dnf install -y oracle-epel-release-el8; \ + dnf config-manager --enable ol8_codeready_builder; \ + dnf install -y hdf5 hdf5-devel libpq-devel; \ + dnf clean all; rm -rf /var/cache/dnf/* + +# Set python3.9 as default python +RUN alternatives --set python3 /usr/bin/python3.9 && \ + ln -sf /usr/bin/python3.9 /usr/bin/python + +# ------------------------------ 
Non-root user ------------------------------ +# UID and GID are passed from docker-compose.yaml build args +# Defaults match docker-compose.yaml defaults for consistency +ARG UID= +ARG GID= +RUN set -eux; \ + if getent group "${GID}" >/dev/null; then \ + GN="$(getent group "${GID}" | cut -d: -f1)"; \ + else \ + groupadd -g "${GID}" gamma; GN=gamma; \ + fi; \ + if id -u gamma >/dev/null 2>&1; then \ + usermod -u "${UID}" -g "${GID}" gamma; \ + else \ + useradd -m -u "${UID}" -g "${GID}" -s /bin/bash gamma; \ + fi; \ + mkdir -p /home/gamma && chown -R "${UID}:${GID}" /home/gamma + +ENV HOME=/home/gamma +WORKDIR /home/gamma + +# ------------------------------ Copy Configuration ------------------------------ +RUN mkdir -p /home/gamma/env +COPY environment.yml /home/gamma/env/ +COPY requirements.txt /home/gamma/env/ +COPY alert_users.yaml /home/gamma/env/ +COPY entrypoint-airflow.sh /home/gamma/entrypoint-airflow.sh + +# Fix ownership +RUN chown -R "${UID}:${GID}" /home/gamma + +# ------------------------------ Install Airflow (venv) ------------------------------ +USER gamma + +# Create virtual environment +RUN python3.9 -m venv /home/gamma/venv + +# Activate venv for subsequent commands (simulated by updating PATH) +ENV VIRTUAL_ENV=/home/gamma/venv +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# Install Airflow +ENV AIRFLOW_VERSION=2.10.3 +ENV PYTHON_VERSION=3.9 +ENV CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" + +RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ + pip install --no-cache-dir "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" && \ + pip install --no-cache-dir apache-airflow-providers-docker apache-airflow-providers-postgres psycopg2-binary \ + jupyter pandas plotly scikit-learn tensorboard tensorflow inotify_simple + +# ------------------------------ Final Setup ------------------------------ +RUN mkdir -p ${HOME}/airflow +ENV 
AIRFLOW_HOME=${HOME}/airflow + +# Make entrypoint executable (switch to root briefly) +USER root +RUN chmod +x /home/gamma/entrypoint-airflow.sh +USER gamma + +WORKDIR /home/gamma/workspace + +# Default entrypoint +# ENTRYPOINT ["/home/gamma/entrypoint-airflow.sh"] diff --git a/env/README.md b/env/README.md new file mode 100644 index 0000000..9cc8b72 --- /dev/null +++ b/env/README.md @@ -0,0 +1,586 @@ +# Creating and Managing Cosiflow Modules + +This guide explains how to create, install, and manage modules for Cosiflow, using the `fastpipeline` module as a reference example. + +--- + +## Table of Contents + +1. [Module Structure](#module-structure) +2. [Creating a New Module](#creating-a-new-module) +3. [Installing a Module](#installing-a-module) +4. [Using Configuration Files](#using-configuration-files) +5. [Creating a Module from Scratch](#creating-a-module-from-scratch) +6. [Updating a Module](#updating-a-module) +7. [Removing a Module](#removing-a-module) +8. [Writing a Dockerfile for a Module](#writing-a-dockerfile-for-a-module) + +--- + +## Prerequisites + +Before installing or managing modules with `hot_load_module.sh`, make sure that: + +1. **Cosiflow is already configured and built**: + - You have followed the instructions in `../README.md` to: + - edit `env/docker-compose.yaml` (UID, GID, passwords, ports, etc.), + - create the `data/postgres_data` directory, + - run `docker compose build` from `cosiflow/env`. + +2. **(Recommended) The Cosiflow stack is running**: + - Start the services with `docker compose up -d` from `cosiflow/env`. + - This ensures that any new DAGs and pipeline scripts you install can be discovered by Airflow. + +3. **You are running commands from the host in the `cosiflow/env` directory**: + - All examples such as `./hot_load_module.sh install` assume: + - your current working directory is `cosiflow/env`, + - Cosiflow services (and their volumes) are correctly defined in `docker-compose.yaml`. 
+ +--- + +## Module Structure + +A Cosiflow module follows a standard directory structure: + +``` +your_module_name/ +├── env/ +│ ├── Dockerfile # Container image definition +│ └── requirements.txt # Python dependencies (optional) +├── src/ +│ ├── dags/ # Airflow DAG definitions +│ │ └── *.py # Your DAG files +│ └── pipeline/ # Pipeline scripts executed by tasks +│ └── *.py # Your pipeline scripts +``` + +### Example: Fastpipeline Module Structure + +``` +fastpipeline/ +├── env/ +│ ├── Dockerfile +│ └── requirements.txt +└── src/ + ├── dags/ + │ ├── cosipipe_simdata.py + │ ├── cosidag_lcurve.py + │ └── cosidag_tsmap.py + └── pipeline/ + ├── stage_files.py + ├── bkg_cut.py + └── ... +``` + +--- + +## Creating a New Module + +To create a new module for Cosiflow: + +1. **Create the module directory** in the workspace root (same level as `cosiflow/` and `fastpipeline/`): + + ```bash + mkdir -p your_module_name/src/dags + mkdir -p your_module_name/src/pipeline + mkdir -p your_module_name/env + ``` + +2. **Add your DAG files** in `src/dags/`: + - Each Python file containing a DAG definition will be automatically discovered by Airflow + - DAGs can use the `COSIDAG` framework from `cosiflow/modules/cosidag.py` for reactive, file-driven workflows + +3. **Add your pipeline scripts** in `src/pipeline/`: + - These are the Python scripts that will be executed by your DAG tasks + - They can be called via `DockerOperator` or `PythonOperator` depending on your needs + +4. **Create a Dockerfile** in `env/Dockerfile` (see [Writing a Dockerfile](#writing-a-dockerfile-for-a-module) section) + +5. 
**Optionally create `requirements.txt`** in `env/` if your module needs Python dependencies + +--- + +## Installing a Module + +Once the core Cosiflow environment has been configured and built (see `../README.md`), you can install a module into the running instance using the `hot_load_module.sh` script: + +```bash +cd cosiflow/env +./hot_load_module.sh <module_name> install +``` + +### What the installation does + +When run with `install` or `update`, `hot_load_module.sh`: + +1. **Links DAGs**: creates a symbolic link from `/home/gamma/airflow/dags/<module_name>.cfmodule` to the module DAG directory (by default `src/dags/` or the one defined in the configuration file). +2. **Links Pipeline scripts**: creates a symbolic link from `/home/gamma/airflow/pipeline/<module_name>.cfmodule` to the module pipeline directory (by default `src/pipeline/` or the one defined in the configuration file). +3. **Manages Python environments and/or Docker images** according to the configuration: + - can create one or more Python virtual environments inside the Airflow container; + - can build a Docker image `<module_name>:latest` from the module `Dockerfile`. + +### Custom paths from the command line + +If your module uses a different structure, you can specify paths manually: + +```bash +./hot_load_module.sh <module_name> install \ + -d <dags_path> \ # Default: src/dags + -p <pipeline_path> \ # Default: src/pipeline + -f <images_path> # Default: env +``` + +Paths are relative to the module root, unless they are absolute. + +### Quick example (without configuration file) + +```bash +./hot_load_module.sh fastpipeline install +``` + +> **Note**: after installation, Airflow may take a few minutes to scan the DAGs and make them visible in the UI. + +--- + +## Using Configuration Files + +To simplify and standardize module installation, you can define a **YAML configuration file** +in the root of the module (same level as `src/` and `env/`). +The `hot_load_module.sh` script automatically detects it (e.g. `*.config.yaml` or `cosiflow.config.yaml`).
+ +A concrete example is the `fta-pipe.config.yaml` file of the *Fast Transient Analysis Pipeline* module: + +```yaml +install_mode: both + +paths: + dags: src/dags + pipeline: src/pipeline + images: env + +environments: + cosipy: + requirements: env/requirements.txt + venv_path: /home/gamma/envs/cosipy + enabled: true + description: "Stable cosipy environment" + +default_environment: cosipy +``` + +### Main fields of the configuration file + +- **`install_mode`**: what to install when you run `install`: + - `container`: only builds the module Docker image; + - `environment`: only creates Python environments; + - `both`: creates Python environments **and** builds the Docker image; + - `none`: only creates DAG/pipeline symlinks, without building anything. + +- **`paths`**: + - `dags`: directory containing the DAGs (default `src/dags`); + - `pipeline`: directory with pipeline scripts (default `src/pipeline`); + - `images`: directory containing the `Dockerfile` (default `env`). + +- **`environments`**: + - map of Python environments that `hot_load_module.sh` can create inside the Airflow container; + - for each environment: + - `requirements`: path to the `requirements.txt` file (relative to the module root); + - `venv_path`: path of the virtualenv in the container (e.g. `/home/gamma/envs/cosipy`); + - `enabled`: if `true`, the environment is created automatically; + - `description`: free-text description (for documentation/logs only). + +- **`default_environment`**: + - name of the environment considered “default” (used by scripts or documentation). + +### How to use the configuration file + +If the configuration file is present in the module root, you can install the module with: + +```bash +cd cosiflow/env +./hot_load_module.sh install +``` + +The script: +- reads the configuration file (e.g. 
`fta-pipe.config.yaml`); +- creates DAG/pipeline symlinks according to the configured paths; +- creates Python environments with `enabled: true`; +- builds the Docker image if requested by `install_mode`. + +Optionally you can override the configuration: + +```bash +# Force creation of the specified environments +./hot_load_module.sh install -e -E env1,env2 + +# Install all environments defined in the YAML file +./hot_load_module.sh install -e -E all +``` + +--- + +## Creating a Module from Scratch + +This section summarizes how to create **from scratch** a new Cosiflow module, +including the folder structure and its configuration file. + +### 1. Structuring the plugin folder + +Assume the module is called `my_new_module` and lives at the same level as `cosiflow/`: + +```bash +mkdir -p my_new_module/src/dags +mkdir -p my_new_module/src/pipeline +mkdir -p my_new_module/env +``` + +Expected structure: + +```text +my_new_module/ +├── env/ +│ ├── Dockerfile # Module Docker image +│ └── requirements.txt # Python dependencies for the module environments +├── src/ +│ ├── dags/ # Airflow DAG definitions +│ │ └── my_dag.py +│ └── pipeline/ # Pipeline scripts called by the DAGs +│ └── my_task.py +└── my-module.config.yaml # (recommended) configuration file for hot_load_module.sh +``` + +Guidelines: +- DAGs in `src/dags/` can use the Cosiflow `COSIDAG` framework; +- scripts in `src/pipeline/` should be designed to run: + - either inside the module Docker image; + - or in a Python virtualenv created in the Airflow container. + +### 2.
Writing the Configuration File + +In the module root, create a file such as `my-module.config.yaml`: + +```yaml +# What 'install' should do +install_mode: both # container | environment | both | none + +# Where DAGs, pipeline and Dockerfile live +paths: + dags: src/dags + pipeline: src/pipeline + images: env + +# Definition of Python environments inside airflow container +environments: + myenv: + requirements: env/requirements.txt + venv_path: /home/gamma/envs/myenv + enabled: true + description: "Environment for my_new_module" + +default_environment: myenv +``` + +Recommendations: +- keep paths **relative** to the module root whenever possible; +- if the module needs multiple environments (e.g. stable/dev), add them under `environments`; +- use clear descriptions, they will appear in `hot_load_module.sh` logs. + +### 3. Installing the new plugin + +Once: +- you have written at least one DAG in `src/dags/`, +- you have added the scripts in `src/pipeline/`, +- you have prepared `env/Dockerfile` and `env/requirements.txt`, +- you have created the configuration file, + +you can install the module: + +```bash +cd cosiflow/env +./hot_load_module.sh my_new_module install +``` + +This: +- links the module DAGs and pipeline into Airflow; +- creates Python environments defined with `enabled: true`; +- builds the module Docker image (if requested by `install_mode`). + +--- + +## Updating a Module + +To update an already installed module (e.g., after modifying DAGs or pipeline scripts): + +```bash +cd cosiflow/env +./hot_load_module.sh update +``` + +The `update` action performs the same steps as `install`: +- Updates the DAGs and pipeline script links +- Rebuilds the Docker image if the Dockerfile has changed + +**Note**: After updating, it may take a few minutes for Airflow to reload the DAGs and reflect your changes. 
+ +--- + +## Removing a Module + +To remove a module from Cosiflow: + +```bash +cd cosiflow/env +./hot_load_module.sh <module_name> remove +``` + +### What the removal does: + +1. **Removes DAGs link**: Deletes the symbolic link in `/home/gamma/airflow/dags/<module_name>.cfmodule` +2. **Removes Pipeline scripts link**: Deletes the symbolic link in `/home/gamma/airflow/pipeline/<module_name>.cfmodule` +3. **Removes Docker image**: Removes the Docker image `<module_name>:latest` + +**Note**: After removal, it may take a few minutes for Airflow to stop showing the DAGs in the UI. The DAGs will be automatically disabled and removed from the Airflow database during the next DAG refresh cycle. + +--- + +## Writing a Dockerfile for a Module + +Each module should have a `Dockerfile` in its `env/` directory. This Docker image will be used by `DockerOperator` tasks in your DAGs to execute pipeline scripts in an isolated environment. + +### Basic Structure + +Here's a template based on the `fastpipeline` module: + +```dockerfile +# ============================================================================= +# Dockerfile — Runtime Container for Your Module +# Base: Choose appropriate base image (see recommendations below) +# ============================================================================= + +FROM <base_image> + +# ------------------------------ System Dependencies ------------------------------ +# Install system libraries required for your scientific packages +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + git \ + wget \ + curl \ + # Add other system dependencies as needed + && rm -rf /var/lib/apt/lists/* + +# ------------------------------ Non-root user ------------------------------ +ARG UID= +ARG GID= + +# Create group and user with specific UID/GID (must match Airflow container user) +RUN groupadd -g "${GID}" gamma || true && \ + useradd -m -u "${UID}" -g "${GID}" -s /bin/bash gamma || true + +# Create shared directories with correct permissions +RUN mkdir -p /shared_dir /data01
/data02 && \ + chown -R "${UID}:${GID}" /shared_dir /data01 /data02 + +USER gamma +WORKDIR /home/gamma + +# ------------------------------ Python Environment ------------------------------ +# Create virtual environment +RUN python -m venv /home/gamma/envs/your_env_name + +# Copy and install requirements +COPY --chown=gamma:gamma requirements.txt /home/gamma/requirements.txt +RUN . /home/gamma/envs/your_env_name/bin/activate && \ + pip install --no-cache-dir --upgrade pip setuptools wheel && \ + pip install --no-cache-dir -r /home/gamma/requirements.txt + +# ------------------------------ Default Configuration ------------------------------ +ENV VIRTUAL_ENV=/home/gamma/envs/your_env_name +ENV PATH="/home/gamma/envs/your_env_name/bin:$PATH" + +CMD ["/bin/bash"] +``` + +### Example: Fastpipeline Dockerfile + +The `fastpipeline` module uses `python:3.10-slim` as its base image: + +```dockerfile +FROM python:3.10-slim + +# System dependencies for scientific packages +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + git \ + wget \ + curl \ + libhdf5-dev \ + libgsl-dev \ + zlib1g-dev \ + libbz2-dev \ + liblzma-dev \ + && rm -rf /var/lib/apt/lists/* + +# Non-root user setup +ARG UID=501 +ARG GID=20 +RUN groupadd -g "${GID}" gamma || true && \ + useradd -m -u "${UID}" -g "${GID}" -s /bin/bash gamma || true + +RUN mkdir -p /shared_dir /data01 /data02 && \ + chown -R "${UID}:${GID}" /shared_dir /data01 /data02 + +USER gamma +WORKDIR /home/gamma + +# Python environment with cosipy +RUN python -m venv /home/gamma/envs/cosipy +COPY --chown=gamma:gamma requirements.txt /home/gamma/requirements.txt +RUN . 
/home/gamma/envs/cosipy/bin/activate && \ + pip install --no-cache-dir --upgrade pip setuptools wheel && \ + pip install --no-cache-dir -r /home/gamma/requirements.txt + +ENV VIRTUAL_ENV=/home/gamma/envs/cosipy +ENV PATH="/home/gamma/envs/cosipy/bin:$PATH" + +CMD ["/bin/bash"] +``` + +### Choosing the Right Base Image + +Selecting an appropriate base image depends on your module's requirements: + +#### **For Lightweight Python Modules** (Recommended for most cases) +- **`python:3.10-slim`** or **`python:3.11-slim`** + - Small image size (~50-100 MB) + - Good for modules that only need Python and standard scientific libraries + - Example: `fastpipeline` module + +#### **For Modules Requiring System Libraries** +- **`python:3.10`** or **`python:3.11`** (full Debian) + - Larger image (~200-300 MB) but includes more system tools + - Use when you need additional system packages or compilers + +#### **For Modules Requiring Specific OS Features** +- **`oraclelinux:8`** (as used in `cosiflow/env/Dockerfile.airflow`) + - Use when you need Oracle Linux-specific packages or compatibility + - Larger image size + - Good for modules that need to match the Airflow container's OS + +#### **For Minimal Dependencies** +- **`python:3.10-alpine`** or **`python:3.11-alpine`** + - Smallest image size (~20-40 MB) + - Use only if you don't need many system libraries (Alpine uses `musl` instead of `glibc`, which can cause compatibility issues) + +### Best Practices + +1. **User ID Matching**: Always set `UID=` and `GID=` to match the Airflow container user (`gamma`). This ensures proper file permissions when mounting volumes. + +2. **Virtual Environments**: Use Python virtual environments to isolate dependencies and avoid conflicts. + +3. **Layer Caching**: Order your Dockerfile commands from least to most frequently changing: + - System dependencies first + - Python environment setup + - Requirements installation + - Application code (if copying) + +4. 
**Multi-stage Builds** (Optional): For complex modules, consider multi-stage builds to reduce final image size. + +5. **Security**: Always use non-root users (`gamma`) and clean up package caches to reduce image size. + +6. **Requirements File**: Keep a `requirements.txt` file in `env/` listing all Python dependencies with version pins for reproducibility. + +7. **Choosing the Right Operator**: + - **PythonOperator**: Prefer this for simple Python tasks that: + - use only standard library or dependencies already available in the Airflow image; + - are lightweight and stateless; + - do not require a dedicated virtual environment. + This is typically the **fastest option** because it runs in-process with the Airflow worker. + - **BashOperator**: Use for simple shell commands and glue logic: + - calling small CLI tools; + - moving/renaming files; + - orchestrating existing scripts that are already available in the Airflow container. + Keep commands short and idempotent; avoid very complex bash logic that is hard to debug. + - **ExternalPythonOperator** (or equivalent external-Python patterns): Use when a task: + - needs a **specific Python environment** (virtualenv) installed inside the Airflow container; + - depends on heavy or conflicting libraries that you do not want in the base Airflow image; + - should be isolated but still run on the same host/container. + Installing many different environments in the same Airflow container can make it heavier and harder to maintain, so prefer a **small number of well-defined environments**. + Execution speed is typically **slower than `PythonOperator`** because it needs to spawn a separate process and activate a virtualenv. 
+ - **DockerOperator**: Use when a task: + - requires external tools or runtimes (other languages, system tools, heavy scientific stacks); + - must run in a **fully isolated environment** separate from the Airflow container; + - benefits from packaging everything in a dedicated image (reproducibility, portability). + Running many `DockerOperator` tasks in parallel can be resource-intensive (CPU, RAM, I/O), so monitor cluster capacity and concurrency. This is usually the **slowest option** (container startup + I/O), but offers the strongest isolation. + +8. **Performance Hierarchy**: As a rule of thumb for execution speed (from faster to slower): + - `PythonOperator` **>** external-Python-style operators **>** `DockerOperator`. + Start with the simplest/fastest operator that satisfies your isolation and dependency requirements, and move to heavier options only when necessary. + +--- + +## Troubleshooting + +### DAGs Not Appearing After Installation + +- Airflow scans the DAGs directory periodically (default: every 5 minutes) +- Wait a few minutes and refresh the Airflow UI +- Check the Airflow logs for DAG parsing errors +- Verify that your DAG files don't have syntax errors +- If DAGs are still not appearing, shut down the Docker Compose stack and start it again (`docker compose down`, then `docker compose up -d`) + +### DAGs still not visible: use `airflow dags list` + +If, after installing/updating a module, DAGs still do not appear in the UI: + +1. **Enter the Airflow container**: + ```bash + cd cosiflow/env + docker compose exec airflow bash + ``` + +2. **Run the diagnostic command**: + ```bash + airflow dags list + ``` + + This command: + - forces the DAG parsing process on the filesystem; + - shows the list of all DAGs that Airflow is actually able to load; + - prints to the terminal any import/parsing errors from `.py` files (stack trace, missing modules, etc.). + +3. 
**Why this can fix the problem**: + - when you run `airflow dags list`, Airflow re-reads the files in the `dags` folder and refreshes its internal state; + - if there are code errors or missing dependencies, you will see them explicitly in the command output: + - you can then fix the DAG or the module Python environment; + - once errors are fixed, running `airflow dags list` again lets you verify that the DAG is finally loaded. + +### Docker Build Fails + +- Ensure the Dockerfile path is correct (default: `env/Dockerfile`) +- Check that all files referenced in the Dockerfile (e.g., `requirements.txt`) exist +- Verify Docker has enough disk space and memory + +### Permission Errors + +- Ensure the Dockerfile creates the `gamma` user with the same `UID` and `GID` as the Airflow container user (the `fastpipeline` example uses `UID=501` and `GID=20`) +- Check that mounted volumes have correct permissions + +### Module Not Found + +- Verify the module directory exists in the workspace root (same level as `cosiflow/`) +- Check that the module name matches the directory name exactly +- Ensure the `modules_pool` volume is correctly mounted in `docker-compose.yaml` + +--- + +## Summary + +Creating and managing Cosiflow modules is straightforward: + +1. **Create** your module with the standard structure (`src/dags/`, `src/pipeline/`, `env/Dockerfile`) +2. **Install** using `./hot_load_module.sh <module_name> install` +3. **Update** using `./hot_load_module.sh <module_name> update` +4. **Remove** using `./hot_load_module.sh <module_name> remove` + +Remember: Allow a few minutes for Airflow to discover and load DAGs after installation or updates. 
diff --git a/env/airflow.cfg.postgresql b/env/airflow.cfg similarity index 99% rename from env/airflow.cfg.postgresql rename to env/airflow.cfg index 99bf743..4ed92f8 100644 --- a/env/airflow.cfg.postgresql +++ b/env/airflow.cfg @@ -122,7 +122,7 @@ max_consecutive_failed_dag_runs_per_dag = 0 # # Variable: AIRFLOW__CORE__LOAD_EXAMPLES # -load_examples = True +load_examples = False # Path to the folder containing Airflow plugins # @@ -499,7 +499,7 @@ alembic_ini_file_path = alembic.ini #sql_alchemy_conn = sqlite:////shared_dir/airflow/airflow.db #for LocalExecutor -sql_alchemy_conn = postgresql+psycopg2://airflow_user:secure_password@cosi_postgres/airflow_db +sql_alchemy_conn = postgresql+psycopg2://airflow_user:secure_password@postgres:5432/airflow_db # Extra engine specific keyword args passed to SQLAlchemy's create_engine, as a JSON-encoded value diff --git a/env/airflow.cfg.SequentialExecutor b/env/airflow.cfg.SequentialExecutor deleted file mode 100644 index 9e794ff..0000000 --- a/env/airflow.cfg.SequentialExecutor +++ /dev/null @@ -1,2542 +0,0 @@ -[core] -# The folder where your airflow pipelines live, most likely a -# subfolder in a code repository. This path must be absolute. -# -# Variable: AIRFLOW__CORE__DAGS_FOLDER -# -dags_folder = /shared_dir/airflow/dags - -# Hostname by providing a path to a callable, which will resolve the hostname. -# The format is "package.function". -# -# For example, default value ``airflow.utils.net.getfqdn`` means that result from patched -# version of `socket.getfqdn() `__, -# see related `CPython Issue `__. -# -# No argument should be required in the function specified. -# If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` -# -# Variable: AIRFLOW__CORE__HOSTNAME_CALLABLE -# -hostname_callable = airflow.utils.net.getfqdn - -# A callable to check if a python file has airflow dags defined or not and should -# return ``True`` if it has dags otherwise ``False``. 
-# If this is not provided, Airflow uses its own heuristic rules. -# -# The function should have the following signature -# -# .. code-block:: python -# -# def func_name(file_path: str, zip_file: zipfile.ZipFile | None = None) -> bool: ... -# -# Variable: AIRFLOW__CORE__MIGHT_CONTAIN_DAG_CALLABLE -# -might_contain_dag_callable = airflow.utils.file.might_contain_dag_via_default_heuristic - -# Default timezone in case supplied date times are naive -# can be `UTC` (default), `system`, or any `IANA ` -# timezone string (e.g. Europe/Amsterdam) -# -# Variable: AIRFLOW__CORE__DEFAULT_TIMEZONE -# -default_timezone = utc - -# The executor class that airflow should use. Choices include -# ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, -# ``KubernetesExecutor``, ``CeleryKubernetesExecutor``, ``LocalKubernetesExecutor`` or the -# full import path to the class when using a custom executor. -# -# Variable: AIRFLOW__CORE__EXECUTOR -# -executor = SequentialExecutor - -# The auth manager class that airflow should use. Full import path to the auth manager class. -# -# Variable: AIRFLOW__CORE__AUTH_MANAGER -# -auth_manager = airflow.providers.fab.auth_manager.fab_auth_manager.FabAuthManager - -# This defines the maximum number of task instances that can run concurrently per scheduler in -# Airflow, regardless of the worker count. Generally this value, multiplied by the number of -# schedulers in your cluster, is the maximum number of task instances with the running -# state in the metadata database. Setting this value to zero allows unlimited parallelism. -# -# Variable: AIRFLOW__CORE__PARALLELISM -# -parallelism = 32 - -# The maximum number of task instances allowed to run concurrently in each DAG. To calculate -# the number of tasks that is running concurrently for a DAG, add up the number of running -# tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``max_active_tasks``, -# which is defaulted as ``[core] max_active_tasks_per_dag``. 
-# -# An example scenario when this would be useful is when you want to stop a new dag with an early -# start date from stealing all the executor slots in a cluster. -# -# Variable: AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG -# -max_active_tasks_per_dag = 16 - -# Are DAGs paused by default at creation -# -# Variable: AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION -# -dags_are_paused_at_creation = True - -# The maximum number of active DAG runs per DAG. The scheduler will not create more DAG runs -# if it reaches the limit. This is configurable at the DAG level with ``max_active_runs``, -# which is defaulted as ``[core] max_active_runs_per_dag``. -# -# Variable: AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG -# -max_active_runs_per_dag = 16 - -# (experimental) The maximum number of consecutive DAG failures before DAG is automatically paused. -# This is also configurable per DAG level with ``max_consecutive_failed_dag_runs``, -# which is defaulted as ``[core] max_consecutive_failed_dag_runs_per_dag``. -# If not specified, then the value is considered as 0, -# meaning that the dags are never paused out by default. -# -# Variable: AIRFLOW__CORE__MAX_CONSECUTIVE_FAILED_DAG_RUNS_PER_DAG -# -max_consecutive_failed_dag_runs_per_dag = 0 - -# The name of the method used in order to start Python processes via the multiprocessing module. -# This corresponds directly with the options available in the Python docs: -# `multiprocessing.set_start_method -# `__ -# must be one of the values returned by `multiprocessing.get_all_start_methods() -# `__. -# -# Example: mp_start_method = fork -# -# Variable: AIRFLOW__CORE__MP_START_METHOD -# -# mp_start_method = - -# Whether to load the DAG examples that ship with Airflow. 
It's good to -# get started, but you probably want to set this to ``False`` in a production -# environment -# -# Variable: AIRFLOW__CORE__LOAD_EXAMPLES -# -load_examples = True - -# Path to the folder containing Airflow plugins -# -# Variable: AIRFLOW__CORE__PLUGINS_FOLDER -# -plugins_folder = /shared_dir/airflow/plugins - -# Should tasks be executed via forking of the parent process -# -# * ``False``: Execute via forking of the parent process -# * ``True``: Spawning a new python process, slower than fork, but means plugin changes picked -# up by tasks straight away -# -# Variable: AIRFLOW__CORE__EXECUTE_TASKS_NEW_PYTHON_INTERPRETER -# -execute_tasks_new_python_interpreter = False - -# Secret key to save connection passwords in the db -# -# Variable: AIRFLOW__CORE__FERNET_KEY -# -fernet_key = - -# Whether to disable pickling dags -# -# Variable: AIRFLOW__CORE__DONOT_PICKLE -# -donot_pickle = True - -# How long before timing out a python file import -# -# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT -# -dagbag_import_timeout = 30.0 - -# Should a traceback be shown in the UI for dagbag import errors, -# instead of just the exception message -# -# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACKS -# -dagbag_import_error_tracebacks = True - -# If tracebacks are shown, how many entries from the traceback should be shown -# -# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACK_DEPTH -# -dagbag_import_error_traceback_depth = 2 - -# How long before timing out a DagFileProcessor, which processes a dag file -# -# Variable: AIRFLOW__CORE__DAG_FILE_PROCESSOR_TIMEOUT -# -dag_file_processor_timeout = 50 - -# The class to use for running task instances in a subprocess. -# Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class -# when using a custom task runner. 
-# -# Variable: AIRFLOW__CORE__TASK_RUNNER -# -task_runner = StandardTaskRunner - -# If set, tasks without a ``run_as_user`` argument will be run with this user -# Can be used to de-elevate a sudo user running Airflow when executing tasks -# -# Variable: AIRFLOW__CORE__DEFAULT_IMPERSONATION -# -default_impersonation = - -# What security module to use (for example kerberos) -# -# Variable: AIRFLOW__CORE__SECURITY -# -security = - -# Turn unit test mode on (overwrites many configuration options with test -# values at runtime) -# -# Variable: AIRFLOW__CORE__UNIT_TEST_MODE -# -unit_test_mode = False - -# Whether to enable pickling for xcom (note that this is insecure and allows for -# RCE exploits). -# -# Variable: AIRFLOW__CORE__ENABLE_XCOM_PICKLING -# -enable_xcom_pickling = False - -# What classes can be imported during deserialization. This is a multi line value. -# The individual items will be parsed as a pattern to a glob function. -# Python built-in classes (like dict) are always allowed. -# -# Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES -# -allowed_deserialization_classes = airflow.* - -# What classes can be imported during deserialization. This is a multi line value. -# The individual items will be parsed as regexp patterns. -# This is a secondary option to ``[core] allowed_deserialization_classes``. -# -# Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES_REGEXP -# -allowed_deserialization_classes_regexp = - -# When a task is killed forcefully, this is the amount of time in seconds that -# it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED -# -# Variable: AIRFLOW__CORE__KILLED_TASK_CLEANUP_TIME -# -killed_task_cleanup_time = 60 - -# Whether to override params with dag_run.conf. If you pass some key-value pairs -# through ``airflow dags backfill -c`` or -# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 
-# -# Variable: AIRFLOW__CORE__DAG_RUN_CONF_OVERRIDES_PARAMS -# -dag_run_conf_overrides_params = True - -# If enabled, Airflow will only scan files containing both ``DAG`` and ``airflow`` (case-insensitive). -# -# Variable: AIRFLOW__CORE__DAG_DISCOVERY_SAFE_MODE -# -dag_discovery_safe_mode = True - -# The pattern syntax used in the -# `.airflowignore -# `__ -# files in the DAG directories. Valid values are ``regexp`` or ``glob``. -# -# Variable: AIRFLOW__CORE__DAG_IGNORE_FILE_SYNTAX -# -dag_ignore_file_syntax = regexp - -# The number of retries each task is going to have by default. Can be overridden at dag or task level. -# -# Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRIES -# -default_task_retries = 0 - -# The number of seconds each task is going to wait by default between retries. Can be overridden at -# dag or task level. -# -# Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRY_DELAY -# -default_task_retry_delay = 300 - -# The maximum delay (in seconds) each task is going to wait by default between retries. -# This is a global setting and cannot be overridden at task or DAG level. -# -# Variable: AIRFLOW__CORE__MAX_TASK_RETRY_DELAY -# -max_task_retry_delay = 86400 - -# The weighting method used for the effective total priority weight of the task -# -# Variable: AIRFLOW__CORE__DEFAULT_TASK_WEIGHT_RULE -# -default_task_weight_rule = downstream - -# Maximum possible time (in seconds) that task will have for execution of auxiliary processes -# (like listeners, mini scheduler...) after task is marked as success.. -# -# Variable: AIRFLOW__CORE__TASK_SUCCESS_OVERTIME -# -task_success_overtime = 20 - -# The default task execution_timeout value for the operators. Expected an integer value to -# be passed into timedelta as seconds. If not specified, then the value is considered as None, -# meaning that the operators are never timed out by default. 
-# -# Variable: AIRFLOW__CORE__DEFAULT_TASK_EXECUTION_TIMEOUT -# -default_task_execution_timeout = - -# Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. -# -# Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_UPDATE_INTERVAL -# -min_serialized_dag_update_interval = 30 - -# If ``True``, serialized DAGs are compressed before writing to DB. -# -# .. note:: -# -# This will disable the DAG dependencies view -# -# Variable: AIRFLOW__CORE__COMPRESS_SERIALIZED_DAGS -# -compress_serialized_dags = False - -# Fetching serialized DAG can not be faster than a minimum interval to reduce database -# read rate. This config controls when your DAGs are updated in the Webserver -# -# Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_FETCH_INTERVAL -# -min_serialized_dag_fetch_interval = 10 - -# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store -# in the Database. -# All the template_fields for each of Task Instance are stored in the Database. -# Keeping this number small may cause an error when you try to view ``Rendered`` tab in -# TaskInstance view for older tasks. -# -# Variable: AIRFLOW__CORE__MAX_NUM_RENDERED_TI_FIELDS_PER_TASK -# -max_num_rendered_ti_fields_per_task = 30 - -# On each dagrun check against defined SLAs -# -# Variable: AIRFLOW__CORE__CHECK_SLAS -# -check_slas = True - -# Path to custom XCom class that will be used to store and resolve operators results -# -# Example: xcom_backend = path.to.CustomXCom -# -# Variable: AIRFLOW__CORE__XCOM_BACKEND -# -xcom_backend = airflow.models.xcom.BaseXCom - -# By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, -# if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. -# -# Variable: AIRFLOW__CORE__LAZY_LOAD_PLUGINS -# -lazy_load_plugins = True - -# By default Airflow providers are lazily-discovered (discovery and imports happen only when required). 
-# Set it to ``False``, if you want to discover providers whenever 'airflow' is invoked via cli or -# loaded from module. -# -# Variable: AIRFLOW__CORE__LAZY_DISCOVER_PROVIDERS -# -lazy_discover_providers = True - -# Hide sensitive **Variables** or **Connection extra json keys** from UI -# and task logs when set to ``True`` -# -# .. note:: -# -# Connection passwords are always hidden in logs -# -# Variable: AIRFLOW__CORE__HIDE_SENSITIVE_VAR_CONN_FIELDS -# -hide_sensitive_var_conn_fields = True - -# A comma-separated list of extra sensitive keywords to look for in variables names or connection's -# extra JSON. -# -# Variable: AIRFLOW__CORE__SENSITIVE_VAR_CONN_NAMES -# -sensitive_var_conn_names = - -# Task Slot counts for ``default_pool``. This setting would not have any effect in an existing -# deployment where the ``default_pool`` is already created. For existing deployments, users can -# change the number of slots using Webserver, API or the CLI -# -# Variable: AIRFLOW__CORE__DEFAULT_POOL_TASK_SLOT_COUNT -# -default_pool_task_slot_count = 128 - -# The maximum list/dict length an XCom can push to trigger task mapping. If the pushed list/dict has a -# length exceeding this value, the task pushing the XCom will be failed automatically to prevent the -# mapped tasks from clogging the scheduler. -# -# Variable: AIRFLOW__CORE__MAX_MAP_LENGTH -# -max_map_length = 1024 - -# The default umask to use for process when run in daemon mode (scheduler, worker, etc.) -# -# This controls the file-creation mode mask which determines the initial value of file permission bits -# for newly created files. -# -# This value is treated as an octal-integer. -# -# Variable: AIRFLOW__CORE__DAEMON_UMASK -# -daemon_umask = 0o077 - -# Class to use as dataset manager. -# -# Example: dataset_manager_class = airflow.datasets.manager.DatasetManager -# -# Variable: AIRFLOW__CORE__DATASET_MANAGER_CLASS -# -# dataset_manager_class = - -# Kwargs to supply to dataset manager. 
-# -# Example: dataset_manager_kwargs = {"some_param": "some_value"} -# -# Variable: AIRFLOW__CORE__DATASET_MANAGER_KWARGS -# -# dataset_manager_kwargs = - -# Dataset URI validation should raise an exception if it is not compliant with AIP-60. -# By default this configuration is false, meaning that Airflow 2.x only warns the user. -# In Airflow 3, this configuration will be enabled by default. -# -# Variable: AIRFLOW__CORE__STRICT_DATASET_URI_VALIDATION -# -strict_dataset_uri_validation = False - -# (experimental) Whether components should use Airflow Internal API for DB connectivity. -# -# Variable: AIRFLOW__CORE__DATABASE_ACCESS_ISOLATION -# -database_access_isolation = False - -# (experimental) Airflow Internal API url. -# Only used if ``[core] database_access_isolation`` is ``True``. -# -# Example: internal_api_url = http://localhost:8080 -# -# Variable: AIRFLOW__CORE__INTERNAL_API_URL -# -# internal_api_url = - -# Secret key used to authenticate internal API clients to core. It should be as random as possible. -# However, when running more than 1 instances of webserver / internal API services, make sure all -# of them use the same ``secret_key`` otherwise calls will fail on authentication. -# The authentication token generated using the secret key has a short expiry time though - make -# sure that time on ALL the machines that you run airflow components on is synchronized -# (for example using ntpd) otherwise you might get "forbidden" errors when the logs are accessed. -# -# Variable: AIRFLOW__CORE__INTERNAL_API_SECRET_KEY -# -internal_api_secret_key = LtXATvV6AP0LJYsI6tVfnA== - -# The ability to allow testing connections across Airflow UI, API and CLI. -# Supported options: ``Disabled``, ``Enabled``, ``Hidden``. Default: Disabled -# Disabled - Disables the test connection functionality and disables the Test Connection button in UI. -# Enabled - Enables the test connection functionality and shows the Test Connection button in UI. 
-# Hidden - Disables the test connection functionality and hides the Test Connection button in UI. -# Before setting this to Enabled, make sure that you review the users who are able to add/edit -# connections and ensure they are trusted. Connection testing can be done maliciously leading to -# undesired and insecure outcomes. -# See `Airflow Security Model: Capabilities of authenticated UI users -# `__ -# for more details. -# -# Variable: AIRFLOW__CORE__TEST_CONNECTION -# -test_connection = Disabled - -# The maximum length of the rendered template field. If the value to be stored in the -# rendered template field exceeds this size, it's redacted. -# -# Variable: AIRFLOW__CORE__MAX_TEMPLATED_FIELD_LENGTH -# -max_templated_field_length = 4096 - -[database] -# Path to the ``alembic.ini`` file. You can either provide the file path relative -# to the Airflow home directory or the absolute path if it is located elsewhere. -# -# Variable: AIRFLOW__DATABASE__ALEMBIC_INI_FILE_PATH -# -alembic_ini_file_path = alembic.ini - -# The SQLAlchemy connection string to the metadata database. -# SQLAlchemy supports many different database engines. -# See: `Set up a Database Backend: Database URI -# `__ -# for more details. -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONN -# -sql_alchemy_conn = sqlite:////shared_dir/airflow/airflow.db - -#for LocalExecutor -#sql_alchemy_conn = postgresql+psycopg2://username:password@hostname/dbname - -# Extra engine specific keyword args passed to SQLAlchemy's create_engine, as a JSON-encoded value -# -# Example: sql_alchemy_engine_args = {"arg1": true} -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_ENGINE_ARGS -# -# sql_alchemy_engine_args = - -# The encoding for the databases -# -# Variable: AIRFLOW__DATABASE__SQL_ENGINE_ENCODING -# -sql_engine_encoding = utf-8 - -# Collation for ``dag_id``, ``task_id``, ``key``, ``external_executor_id`` columns -# in case they have different encoding. 
-# By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb`` -# the default is ``utf8mb3_bin`` so that the index sizes of our index keys will not exceed -# the maximum size of allowed index when collation is set to ``utf8mb4`` variant, see -# `GitHub Issue Comment `__ -# for more details. -# -# Variable: AIRFLOW__DATABASE__SQL_ENGINE_COLLATION_FOR_IDS -# -# sql_engine_collation_for_ids = - -# If SQLAlchemy should pool database connections. -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_ENABLED -# -sql_alchemy_pool_enabled = True - -# The SQLAlchemy pool size is the maximum number of database connections -# in the pool. 0 indicates no limit. -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_SIZE -# -sql_alchemy_pool_size = 5 - -# The maximum overflow size of the pool. -# When the number of checked-out connections reaches the size set in pool_size, -# additional connections will be returned up to this limit. -# When those additional connections are returned to the pool, they are disconnected and discarded. -# It follows then that the total number of simultaneous connections the pool will allow -# is **pool_size** + **max_overflow**, -# and the total number of "sleeping" connections the pool will allow is pool_size. -# max_overflow can be set to ``-1`` to indicate no overflow limit; -# no limit will be placed on the total number of concurrent connections. Defaults to ``10``. -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_MAX_OVERFLOW -# -sql_alchemy_max_overflow = 10 - -# The SQLAlchemy pool recycle is the number of seconds a connection -# can be idle in the pool before it is invalidated. This config does -# not apply to sqlite. If the number of DB connections is ever exceeded, -# a lower config value will allow the system to recover faster. -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_RECYCLE -# -sql_alchemy_pool_recycle = 1800 - -# Check connection at the start of each connection pool checkout. 
-# Typically, this is a simple statement like "SELECT 1". -# See `SQLAlchemy Pooling: Disconnect Handling - Pessimistic -# `__ -# for more details. -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_PRE_PING -# -sql_alchemy_pool_pre_ping = True - -# The schema to use for the metadata database. -# SQLAlchemy supports databases with the concept of multiple schemas. -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SCHEMA -# -sql_alchemy_schema = - -# Import path for connect args in SQLAlchemy. Defaults to an empty dict. -# This is useful when you want to configure db engine args that SQLAlchemy won't parse -# in connection string. This can be set by passing a dictionary containing the create engine parameters. -# For more details about passing create engine parameters (keepalives variables, timeout etc) -# in Postgres DB Backend see `Setting up a PostgreSQL Database -# `__ -# e.g ``connect_args={"timeout":30}`` can be defined in ``airflow_local_settings.py`` and -# can be imported as shown below -# -# Example: sql_alchemy_connect_args = airflow_local_settings.connect_args -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONNECT_ARGS -# -# sql_alchemy_connect_args = - -# Important Warning: Use of sql_alchemy_session_maker Highly Discouraged -# Import path for function which returns 'sqlalchemy.orm.sessionmaker'. -# Improper configuration of sql_alchemy_session_maker can lead to serious issues, -# including data corruption, unrecoverable application crashes. Please review the SQLAlchemy -# documentation for detailed guidance on proper configuration and best practices. -# -# Example: sql_alchemy_session_maker = airflow_local_settings._sessionmaker -# -# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SESSION_MAKER -# -# sql_alchemy_session_maker = - -# Whether to load the default connections that ship with Airflow when ``airflow db init`` is called. -# It's good to get started, but you probably want to set this to ``False`` in a production environment. 
-# -# Variable: AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS -# -load_default_connections = True - -# Number of times the code should be retried in case of DB Operational Errors. -# Not all transactions will be retried as it can cause undesired state. -# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. -# -# Variable: AIRFLOW__DATABASE__MAX_DB_RETRIES -# -max_db_retries = 3 - -# Whether to run alembic migrations during Airflow start up. Sometimes this operation can be expensive, -# and the users can assert the correct version through other means (e.g. through a Helm chart). -# Accepts ``True`` or ``False``. -# -# Variable: AIRFLOW__DATABASE__CHECK_MIGRATIONS -# -check_migrations = True - -[logging] -# The folder where airflow should store its log files. -# This path must be absolute. -# There are a few existing configurations that assume this is set to the default. -# If you choose to override this you may need to update the -# ``[logging] dag_processor_manager_log_location`` and -# ``[logging] child_process_log_directory settings`` as well. -# -# Variable: AIRFLOW__LOGGING__BASE_LOG_FOLDER -# -base_log_folder = /shared_dir/airflow/logs - -# Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. -# Set this to ``True`` if you want to enable remote logging. -# -# Variable: AIRFLOW__LOGGING__REMOTE_LOGGING -# -remote_logging = False - -# Users must supply an Airflow connection id that provides access to the storage -# location. Depending on your remote logging service, this may only be used for -# reading logs, not writing them. -# -# Variable: AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID -# -remote_log_conn_id = - -# Whether the local log files for GCS, S3, WASB and OSS remote logging should be deleted after -# they are uploaded to the remote location. -# -# Variable: AIRFLOW__LOGGING__DELETE_LOCAL_LOGS -# -delete_local_logs = False - -# Path to Google Credential JSON file. 
If omitted, authorization based on `the Application Default -# Credentials -# `__ will -# be used. -# -# Variable: AIRFLOW__LOGGING__GOOGLE_KEY_PATH -# -google_key_path = - -# Storage bucket URL for remote logging -# S3 buckets should start with **s3://** -# Cloudwatch log groups should start with **cloudwatch://** -# GCS buckets should start with **gs://** -# WASB buckets should start with **wasb** just to help Airflow select correct handler -# Stackdriver logs should start with **stackdriver://** -# -# Variable: AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER -# -remote_base_log_folder = - -# The remote_task_handler_kwargs param is loaded into a dictionary and passed to the ``__init__`` -# of remote task handler and it overrides the values provided by Airflow config. For example if you set -# ``delete_local_logs=False`` and you provide ``{"delete_local_copy": true}``, then the local -# log files will be deleted after they are uploaded to remote location. -# -# Example: remote_task_handler_kwargs = {"delete_local_copy": true} -# -# Variable: AIRFLOW__LOGGING__REMOTE_TASK_HANDLER_KWARGS -# -remote_task_handler_kwargs = - -# Use server-side encryption for logs stored in S3 -# -# Variable: AIRFLOW__LOGGING__ENCRYPT_S3_LOGS -# -encrypt_s3_logs = False - -# Logging level. -# -# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. -# -# Variable: AIRFLOW__LOGGING__LOGGING_LEVEL -# -logging_level = INFO - -# Logging level for celery. If not set, it uses the value of logging_level -# -# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. -# -# Variable: AIRFLOW__LOGGING__CELERY_LOGGING_LEVEL -# -celery_logging_level = - -# Logging level for Flask-appbuilder UI. -# -# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 
-# -# Variable: AIRFLOW__LOGGING__FAB_LOGGING_LEVEL -# -fab_logging_level = WARNING - -# Logging class -# Specify the class that will specify the logging configuration -# This class has to be on the python classpath -# -# Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG -# -# Variable: AIRFLOW__LOGGING__LOGGING_CONFIG_CLASS -# -logging_config_class = - -# Flag to enable/disable Colored logs in Console -# Colour the logs when the controlling terminal is a TTY. -# -# Variable: AIRFLOW__LOGGING__COLORED_CONSOLE_LOG -# -colored_console_log = True - -# Log format for when Colored logs is enabled -# -# Variable: AIRFLOW__LOGGING__COLORED_LOG_FORMAT -# -colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s - -# Specifies the class utilized by Airflow to implement colored logging -# -# Variable: AIRFLOW__LOGGING__COLORED_FORMATTER_CLASS -# -colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter - -# Format of Log line -# -# Variable: AIRFLOW__LOGGING__LOG_FORMAT -# -log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s - -# Defines the format of log messages for simple logging configuration -# -# Variable: AIRFLOW__LOGGING__SIMPLE_LOG_FORMAT -# -simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s - -# Where to send dag parser logs. If "file", logs are sent to log files defined by child_process_log_directory. 
-# -# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_TARGET -# -dag_processor_log_target = file - -# Format of Dag Processor Log line -# -# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_FORMAT -# -dag_processor_log_format = [%%(asctime)s] [SOURCE:DAG_PROCESSOR] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s - -# Determines the formatter class used by Airflow for structuring its log messages -# The default formatter class is timezone-aware, which means that timestamps attached to log entries -# will be adjusted to reflect the local timezone of the Airflow instance -# -# Variable: AIRFLOW__LOGGING__LOG_FORMATTER_CLASS -# -log_formatter_class = airflow.utils.log.timezone_aware.TimezoneAware - -# An import path to a function to add adaptations of each secret added with -# ``airflow.utils.log.secrets_masker.mask_secret`` to be masked in log messages. The given function -# is expected to require a single parameter: the secret to be adapted. It may return a -# single adaptation of the secret or an iterable of adaptations to each be masked as secrets. -# The original secret will be masked as well as any adaptations returned. -# -# Example: secret_mask_adapter = urllib.parse.quote -# -# Variable: AIRFLOW__LOGGING__SECRET_MASK_ADAPTER -# -secret_mask_adapter = - -# Specify prefix pattern like mentioned below with stream handler ``TaskHandlerWithCustomFormatter`` -# -# Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{ti.try_number}} -# -# Variable: AIRFLOW__LOGGING__TASK_LOG_PREFIX_TEMPLATE -# -task_log_prefix_template = - -# Formatting for how airflow generates file names/paths for each task run. 
-# -# Variable: AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE -# -log_filename_template = dag_id={{ ti.dag_id }}/run_id={{ ti.run_id }}/task_id={{ ti.task_id }}/{%% if ti.map_index >= 0 %%}map_index={{ ti.map_index }}/{%% endif %%}attempt={{ try_number }}.log - -# Formatting for how airflow generates file names for log -# -# Variable: AIRFLOW__LOGGING__LOG_PROCESSOR_FILENAME_TEMPLATE -# -log_processor_filename_template = {{ filename }}.log - -# Full path of dag_processor_manager logfile. -# -# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_LOCATION -# -dag_processor_manager_log_location = /shared_dir/airflow/logs/dag_processor_manager/dag_processor_manager.log - -# Whether DAG processor manager will write logs to stdout -# -# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_STDOUT -# -dag_processor_manager_log_stdout = False - -# Name of handler to read task instance logs. -# Defaults to use ``task`` handler. -# -# Variable: AIRFLOW__LOGGING__TASK_LOG_READER -# -task_log_reader = task - -# A comma-separated list of third-party logger names that will be configured to print messages to -# consoles. -# -# Example: extra_logger_names = connexion,sqlalchemy -# -# Variable: AIRFLOW__LOGGING__EXTRA_LOGGER_NAMES -# -extra_logger_names = - -# When you start an Airflow worker, Airflow starts a tiny web server -# subprocess to serve the workers local log files to the airflow main -# web server, who then builds pages and sends them to users. This defines -# the port on which the logs are served. It needs to be unused, and open -# visible from the main web server to connect into the workers. -# -# Variable: AIRFLOW__LOGGING__WORKER_LOG_SERVER_PORT -# -worker_log_server_port = 8793 - -# Port to serve logs from for triggerer. -# See ``[logging] worker_log_server_port`` description for more info. -# -# Variable: AIRFLOW__LOGGING__TRIGGER_LOG_SERVER_PORT -# -trigger_log_server_port = 8794 - -# We must parse timestamps to interleave logs between trigger and task.
To do so, -# we need to parse timestamps in log files. In case your log format is non-standard, -# you may provide import path to callable which takes a string log line and returns -# the timestamp (datetime.datetime compatible). -# -# Example: interleave_timestamp_parser = path.to.my_func -# -# Variable: AIRFLOW__LOGGING__INTERLEAVE_TIMESTAMP_PARSER -# -# interleave_timestamp_parser = - -# Permissions in the form of an octal string as understood by chmod. The permissions are important -# when you use impersonation, when logs are written by a different user than airflow. The most secure -# way of configuring it in this case is to add both users to the same group and make it the default -# group of both users. Group-writeable logs are default in airflow, but you might decide that you are -# OK with having the logs other-writeable, in which case you should set it to ``0o777``. You might -# decide to add more security if you do not use impersonation and change it to ``0o755`` to make it -# only owner-writeable. You can also make it just readable only for owner by changing it to ``0o700`` -# if all the access (read/write) for your logs happens from the same user. -# -# Example: file_task_handler_new_folder_permissions = 0o775 -# -# Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FOLDER_PERMISSIONS -# -file_task_handler_new_folder_permissions = 0o775 - -# Permissions in the form of an octal string as understood by chmod. The permissions are important -# when you use impersonation, when logs are written by a different user than airflow. The most secure -# way of configuring it in this case is to add both users to the same group and make it the default -# group of both users. Group-writeable logs are default in airflow, but you might decide that you are -# OK with having the logs other-writeable, in which case you should set it to ``0o666``.
You might -# decide to add more security if you do not use impersonation and change it to ``0o644`` to make it -# only owner-writeable. You can also make it just readable only for owner by changing it to ``0o600`` -# if all the access (read/write) for your logs happens from the same user. -# -# Example: file_task_handler_new_file_permissions = 0o664 -# -# Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FILE_PERMISSIONS -# -file_task_handler_new_file_permissions = 0o664 - -# By default Celery sends all logs into stderr. -# If enabled any previous logging handlers will get *removed*. -# With this option AirFlow will create new handlers -# and send low level logs like INFO and WARNING to stdout, -# while sending higher severity logs to stderr. -# -# Variable: AIRFLOW__LOGGING__CELERY_STDOUT_STDERR_SEPARATION -# -celery_stdout_stderr_separation = False - -# If enabled, Airflow may ship messages to task logs from outside the task run context, e.g. from -# the scheduler, executor, or callback execution context. This can help in circumstances such as -# when there's something blocking the execution of the task and ordinarily there may be no task -# logs at all. -# This is set to ``True`` by default. If you encounter issues with this feature -# (e.g. scheduler performance issues) it can be disabled. -# -# Variable: AIRFLOW__LOGGING__ENABLE_TASK_CONTEXT_LOGGER -# -enable_task_context_logger = True - -# A comma separated list of keywords related to errors whose presence should display the line in red -# color in UI -# -# Variable: AIRFLOW__LOGGING__COLOR_LOG_ERROR_KEYWORDS -# -color_log_error_keywords = error,exception - -# A comma separated list of keywords related to warning whose presence should display the line in yellow -# color in UI -# -# Variable: AIRFLOW__LOGGING__COLOR_LOG_WARNING_KEYWORDS -# -color_log_warning_keywords = warn - -[metrics] -# `StatsD `__ integration settings. 
- -# If true, ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` will use -# regex pattern matching anywhere within the metric name instead of only prefix matching -# at the start of the name. -# -# Variable: AIRFLOW__METRICS__METRICS_USE_PATTERN_MATCH -# -metrics_use_pattern_match = False - -# Configure an allow list (comma separated string) to send only certain metrics. -# If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix. -# If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match. -# -# Example: metrics_allow_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout" -# -# Variable: AIRFLOW__METRICS__METRICS_ALLOW_LIST -# -metrics_allow_list = - -# Configure a block list (comma separated string) to block certain metrics from being emitted. -# If ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` are both configured, -# ``[metrics] metrics_block_list`` is ignored. -# -# If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix. -# -# If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match. -# -# Example: metrics_block_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout" -# -# Variable: AIRFLOW__METRICS__METRICS_BLOCK_LIST -# -metrics_block_list = - -# Enables sending metrics to StatsD. 
-# -# Variable: AIRFLOW__METRICS__STATSD_ON -# -statsd_on = False - -# Specifies the host address where the StatsD daemon (or server) is running -# -# Variable: AIRFLOW__METRICS__STATSD_HOST -# -statsd_host = localhost - -# Specifies the port on which the StatsD daemon (or server) is listening to -# -# Variable: AIRFLOW__METRICS__STATSD_PORT -# -statsd_port = 8125 - -# Defines the namespace for all metrics sent from Airflow to StatsD -# -# Variable: AIRFLOW__METRICS__STATSD_PREFIX -# -statsd_prefix = airflow - -# A function that validate the StatsD stat name, apply changes to the stat name if necessary and return -# the transformed stat name. -# -# The function should have the following signature -# -# .. code-block:: python -# -# def func_name(stat_name: str) -> str: ... -# -# Variable: AIRFLOW__METRICS__STAT_NAME_HANDLER -# -stat_name_handler = - -# To enable datadog integration to send airflow metrics. -# -# Variable: AIRFLOW__METRICS__STATSD_DATADOG_ENABLED -# -statsd_datadog_enabled = False - -# List of datadog tags attached to all metrics(e.g: ``key1:value1,key2:value2``) -# -# Variable: AIRFLOW__METRICS__STATSD_DATADOG_TAGS -# -statsd_datadog_tags = - -# Set to ``False`` to disable metadata tags for some of the emitted metrics -# -# Variable: AIRFLOW__METRICS__STATSD_DATADOG_METRICS_TAGS -# -statsd_datadog_metrics_tags = True - -# If you want to utilise your own custom StatsD client set the relevant -# module path below. 
-# Note: The module path must exist on your -# `PYTHONPATH <https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH>`__ -# for Airflow to pick it up -# -# Variable: AIRFLOW__METRICS__STATSD_CUSTOM_CLIENT_PATH -# -# statsd_custom_client_path = - -# If you want to avoid sending all the available metrics tags to StatsD, -# you can configure a block list of prefixes (comma separated) to filter out metric tags -# that start with the elements of the list (e.g: ``job_id,run_id``) -# -# Example: statsd_disabled_tags = job_id,run_id,dag_id,task_id -# -# Variable: AIRFLOW__METRICS__STATSD_DISABLED_TAGS -# -statsd_disabled_tags = job_id,run_id - -# To enable sending Airflow metrics with StatsD-Influxdb tagging convention. -# -# Variable: AIRFLOW__METRICS__STATSD_INFLUXDB_ENABLED -# -statsd_influxdb_enabled = False - -# Enables sending metrics to OpenTelemetry. -# -# Variable: AIRFLOW__METRICS__OTEL_ON -# -otel_on = False - -# Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends -# metrics and traces. -# -# Variable: AIRFLOW__METRICS__OTEL_HOST -# -otel_host = localhost - -# Specifies the port on which the OpenTelemetry Collector is listening. -# -# Variable: AIRFLOW__METRICS__OTEL_PORT -# -otel_port = 8889 - -# The prefix for the Airflow metrics. -# -# Variable: AIRFLOW__METRICS__OTEL_PREFIX -# -otel_prefix = airflow - -# Defines the interval, in milliseconds, at which Airflow sends batches of metrics and traces -# to the configured OpenTelemetry Collector. -# -# Variable: AIRFLOW__METRICS__OTEL_INTERVAL_MILLISECONDS -# -otel_interval_milliseconds = 60000 - -# If ``True``, all metrics are also emitted to the console. Defaults to ``False``. -# -# Variable: AIRFLOW__METRICS__OTEL_DEBUGGING_ON -# -otel_debugging_on = False - -# The default service name of traces. -# -# Variable: AIRFLOW__METRICS__OTEL_SERVICE -# -otel_service = Airflow - -# If ``True``, SSL will be enabled. Defaults to ``False``.
-# To establish an HTTPS connection to the OpenTelemetry collector, -# you need to configure the SSL certificate and key within the OpenTelemetry collector's -# ``config.yml`` file. -# -# Variable: AIRFLOW__METRICS__OTEL_SSL_ACTIVE -# -otel_ssl_active = False - -[traces] -# Distributed traces integration settings. - -# Enables sending traces to OpenTelemetry. -# -# Variable: AIRFLOW__TRACES__OTEL_ON -# -otel_on = False - -# Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends -# traces. -# -# Variable: AIRFLOW__TRACES__OTEL_HOST -# -otel_host = localhost - -# Specifies the port of the OpenTelemetry Collector that is listening to. -# -# Variable: AIRFLOW__TRACES__OTEL_PORT -# -otel_port = 8889 - -# The default service name of traces. -# -# Variable: AIRFLOW__TRACES__OTEL_SERVICE -# -otel_service = Airflow - -# If True, all traces are also emitted to the console. Defaults to False. -# -# Variable: AIRFLOW__TRACES__OTEL_DEBUGGING_ON -# -otel_debugging_on = False - -# If True, SSL will be enabled. Defaults to False. -# To establish an HTTPS connection to the OpenTelemetry collector, -# you need to configure the SSL certificate and key within the OpenTelemetry collector's -# config.yml file. -# -# Variable: AIRFLOW__TRACES__OTEL_SSL_ACTIVE -# -otel_ssl_active = False - -# If True, after the task is complete, the full task log messages will be added as the -# span events, chunked by 64k size. defaults to False. -# -# Variable: AIRFLOW__TRACES__OTEL_TASK_LOG_EVENT -# -otel_task_log_event = False - -[secrets] -# Full class name of secrets backend to enable (will precede env vars and metastore in search path) -# -# Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend -# -# Variable: AIRFLOW__SECRETS__BACKEND -# -backend = - -# The backend_kwargs param is loaded into a dictionary and passed to ``__init__`` -# of secrets backend class. 
See documentation for the secrets backend you are using. -# JSON is expected. -# -# Example for AWS Systems Manager ParameterStore: -# ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` -# -# Variable: AIRFLOW__SECRETS__BACKEND_KWARGS -# -backend_kwargs = - -# .. note:: |experimental| -# -# Enables local caching of Variables, when parsing DAGs only. -# Using this option can make dag parsing faster if Variables are used in top level code, at the expense -# of longer propagation time for changes. -# Please note that this cache concerns only the DAG parsing step. There is no caching in place when DAG -# tasks are run. -# -# Variable: AIRFLOW__SECRETS__USE_CACHE -# -use_cache = False - -# .. note:: |experimental| -# -# When the cache is enabled, this is the duration for which we consider an entry in the cache to be -# valid. Entries are refreshed if they are older than this many seconds. -# It means that when the cache is enabled, this is the maximum amount of time you need to wait to see a -# Variable change take effect. -# -# Variable: AIRFLOW__SECRETS__CACHE_TTL_SECONDS -# -cache_ttl_seconds = 900 - -[cli] -# In what way should the cli access the API. The LocalClient will use the -# database directly, while the json_client will use the api running on the -# webserver -# -# Variable: AIRFLOW__CLI__API_CLIENT -# -api_client = airflow.api.client.local_client - -# If you set web_server_url_prefix, do NOT forget to append it here, ex: -# ``endpoint_url = http://localhost:8080/myroot`` -# So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` -# -# Variable: AIRFLOW__CLI__ENDPOINT_URL -# -endpoint_url = http://localhost:8080 - -[debug] -# Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first -# failed task. Helpful for debugging purposes. -# -# Variable: AIRFLOW__DEBUG__FAIL_FAST -# -fail_fast = False - -[api] -# Enables the deprecated experimental API. 
Please note that these API endpoints do not have -# access control. An authenticated user has full access. -# -# .. warning:: -# -# This `Experimental REST API -# `__ is -# deprecated since version 2.0. Please consider using -# `the Stable REST API -# `__. -# For more information on migration, see -# `RELEASE_NOTES.rst `_ -# -# Variable: AIRFLOW__API__ENABLE_EXPERIMENTAL_API -# -enable_experimental_api = False - -# Comma separated list of auth backends to authenticate users of the API. See -# `Security: API -# `__ for possible values. -# ("airflow.api.auth.backend.default" allows all requests for historic reasons) -# -# Variable: AIRFLOW__API__AUTH_BACKENDS -# -auth_backends = airflow.api.auth.backend.session - -# Used to set the maximum page limit for API requests. If limit passed as param -# is greater than maximum page limit, it will be ignored and maximum page limit value -# will be set as the limit -# -# Variable: AIRFLOW__API__MAXIMUM_PAGE_LIMIT -# -maximum_page_limit = 100 - -# Used to set the default page limit when limit param is zero or not provided in API -# requests. Otherwise if positive integer is passed in the API requests as limit, the -# smallest number of user given limit or maximum page limit is taken as limit. -# -# Variable: AIRFLOW__API__FALLBACK_PAGE_LIMIT -# -fallback_page_limit = 100 - -# The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. -# -# Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com -# -# Variable: AIRFLOW__API__GOOGLE_OAUTH2_AUDIENCE -# -google_oauth2_audience = - -# Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on -# `the Application Default Credentials -# `__ will -# be used. 
-# -# Example: google_key_path = /files/service-account-json -# -# Variable: AIRFLOW__API__GOOGLE_KEY_PATH -# -google_key_path = - -# Used in response to a preflight request to indicate which HTTP -# headers can be used when making the actual request. This header is -# the server side response to the browser's -# Access-Control-Request-Headers header. -# -# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_HEADERS -# -access_control_allow_headers = - -# Specifies the method or methods allowed when accessing the resource. -# -# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_METHODS -# -access_control_allow_methods = - -# Indicates whether the response can be shared with requesting code from the given origins. -# Separate URLs with space. -# -# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_ORIGINS -# -access_control_allow_origins = - -# Indicates whether the **xcomEntries** endpoint supports the **deserialize** -# flag. If set to ``False``, setting this flag in a request would result in a -# 400 Bad Request error. -# -# Variable: AIRFLOW__API__ENABLE_XCOM_DESERIALIZE_SUPPORT -# -enable_xcom_deserialize_support = False - -[lineage] -# what lineage backend to use -# -# Variable: AIRFLOW__LINEAGE__BACKEND -# -backend = - -[operators] -# The default owner assigned to each new operator, unless -# provided explicitly or passed via ``default_args`` -# -# Variable: AIRFLOW__OPERATORS__DEFAULT_OWNER -# -default_owner = airflow - -# The default value of attribute "deferrable" in operators and sensors. 
-# -# Variable: AIRFLOW__OPERATORS__DEFAULT_DEFERRABLE -# -default_deferrable = false - -# Indicates the default number of CPU units allocated to each operator when no specific CPU request -# is specified in the operator's configuration -# -# Variable: AIRFLOW__OPERATORS__DEFAULT_CPUS -# -default_cpus = 1 - -# Indicates the default number of RAM allocated to each operator when no specific RAM request -# is specified in the operator's configuration -# -# Variable: AIRFLOW__OPERATORS__DEFAULT_RAM -# -default_ram = 512 - -# Indicates the default number of disk storage allocated to each operator when no specific disk request -# is specified in the operator's configuration -# -# Variable: AIRFLOW__OPERATORS__DEFAULT_DISK -# -default_disk = 512 - -# Indicates the default number of GPUs allocated to each operator when no specific GPUs request -# is specified in the operator's configuration -# -# Variable: AIRFLOW__OPERATORS__DEFAULT_GPUS -# -default_gpus = 0 - -# Default queue that tasks get assigned to and that worker listen on. -# -# Variable: AIRFLOW__OPERATORS__DEFAULT_QUEUE -# -default_queue = default - -# Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. -# If set to ``False``, an exception will be thrown, -# otherwise only the console message will be displayed. -# -# Variable: AIRFLOW__OPERATORS__ALLOW_ILLEGAL_ARGUMENTS -# -allow_illegal_arguments = False - -[webserver] -# The message displayed when a user attempts to execute actions beyond their authorised privileges. -# -# Variable: AIRFLOW__WEBSERVER__ACCESS_DENIED_MESSAGE -# -access_denied_message = Access is Denied - -# Path of webserver config file used for configuring the webserver parameters -# -# Variable: AIRFLOW__WEBSERVER__CONFIG_FILE -# -config_file = /shared_dir/airflow/webserver_config.py - -# The base url of your website: Airflow cannot guess what domain or CNAME you are using. 
-# This is used to create links in the Log Url column in the Browse - Task Instances menu, -# as well as in any automated emails sent by Airflow that contain links to your webserver. -# -# Variable: AIRFLOW__WEBSERVER__BASE_URL -# -base_url = http://localhost:8080 - -# Default timezone to display all dates in the UI, can be UTC, system, or -# any IANA timezone string (e.g. **Europe/Amsterdam**). If left empty the -# default value of core/default_timezone will be used -# -# Example: default_ui_timezone = America/New_York -# -# Variable: AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE -# -default_ui_timezone = UTC - -# The ip specified when starting the web server -# -# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_HOST -# -web_server_host = 0.0.0.0 - -# The port on which to run the web server -# -# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_PORT -# -web_server_port = 8080 - -# Paths to the SSL certificate and key for the web server. When both are -# provided SSL will be enabled. This does not change the web server port. -# -# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_CERT -# -web_server_ssl_cert = - -# Paths to the SSL certificate and key for the web server. When both are -# provided SSL will be enabled. This does not change the web server port. -# -# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_KEY -# -web_server_ssl_key = - -# The type of backend used to store web session data, can be ``database`` or ``securecookie``. For the -# ``database`` backend, sessions are store in the database and they can be -# managed there (for example when you reset password of the user, all sessions for that user are -# deleted). For the ``securecookie`` backend, sessions are stored in encrypted cookies on the client -# side. 
The ``securecookie`` mechanism is 'lighter' than database backend, but sessions are not deleted -# when you reset password of the user, which means that other than waiting for expiry time, the only -# way to invalidate all sessions for a user is to change secret_key and restart webserver (which -# also invalidates and logs out all other user's sessions). -# -# When you are using ``database`` backend, make sure to keep your database session table small -# by periodically running ``airflow db clean --table session`` command, especially if you have -# automated API calls that will create a new session for each call rather than reuse the sessions -# stored in browser cookies. -# -# Example: session_backend = securecookie -# -# Variable: AIRFLOW__WEBSERVER__SESSION_BACKEND -# -session_backend = database - -# Number of seconds the webserver waits before killing gunicorn master that doesn't respond -# -# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_MASTER_TIMEOUT -# -web_server_master_timeout = 120 - -# Number of seconds the gunicorn webserver waits before timing out on a worker -# -# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_WORKER_TIMEOUT -# -web_server_worker_timeout = 120 - -# Number of workers to refresh at a time. When set to 0, worker refresh is -# disabled. When nonzero, airflow periodically refreshes webserver workers by -# bringing up new ones and killing old ones. -# -# Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_BATCH_SIZE -# -worker_refresh_batch_size = 1 - -# Number of seconds to wait before refreshing a batch of workers. -# -# Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL -# -worker_refresh_interval = 6000 - -# If set to ``True``, Airflow will track files in plugins_folder directory. When it detects changes, -# then reload the gunicorn. If set to ``True``, gunicorn starts without preloading, which is slower, -# uses more memory, and may cause race conditions. Avoid setting this to ``True`` in production. 
-# -# Variable: AIRFLOW__WEBSERVER__RELOAD_ON_PLUGIN_CHANGE -# -reload_on_plugin_change = False - -# Secret key used to run your flask app. It should be as random as possible. However, when running -# more than 1 instances of webserver, make sure all of them use the same ``secret_key`` otherwise -# one of them will error with "CSRF session token is missing". -# The webserver key is also used to authorize requests to Celery workers when logs are retrieved. -# The token generated using the secret key has a short expiry time though - make sure that time on -# ALL the machines that you run airflow components on is synchronized (for example using ntpd) -# otherwise you might get "forbidden" errors when the logs are accessed. -# -# Variable: AIRFLOW__WEBSERVER__SECRET_KEY -# -secret_key = LtXATvV6AP0LJYsI6tVfnA== - -# Number of workers to run the Gunicorn web server -# -# Variable: AIRFLOW__WEBSERVER__WORKERS -# -workers = 4 - -# The worker class gunicorn should use. Choices include -# ``sync`` (default), ``eventlet``, ``gevent``. -# -# .. warning:: -# -# When using ``gevent`` you might also want to set the ``_AIRFLOW_PATCH_GEVENT`` -# environment variable to ``"1"`` to make sure gevent patching is done as early as possible. -# -# Be careful to set ``_AIRFLOW_PATCH_GEVENT`` only on the web server as gevent patching may -# affect the scheduler behavior via the ``multiprocessing`` sockets module and cause crash. -# -# See related Issues / PRs for more details: -# -# * https://github.com/benoitc/gunicorn/issues/2796 -# * https://github.com/apache/airflow/issues/8212 -# * https://github.com/apache/airflow/pull/28283 -# -# Variable: AIRFLOW__WEBSERVER__WORKER_CLASS -# -worker_class = sync - -# Log files for the gunicorn webserver. '-' means log to stderr. -# -# Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFILE -# -access_logfile = - - -# Log files for the gunicorn webserver. '-' means log to stderr. 
-# -# Variable: AIRFLOW__WEBSERVER__ERROR_LOGFILE -# -error_logfile = - - -# Access log format for gunicorn webserver. -# default format is ``%%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s"`` -# See `Gunicorn Settings: 'access_log_format' Reference -# `__ for more details -# -# Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFORMAT -# -access_logformat = - -# Expose the configuration file in the web server. Set to ``non-sensitive-only`` to show all values -# except those that have security implications. ``True`` shows all values. ``False`` hides the -# configuration completely. -# -# Variable: AIRFLOW__WEBSERVER__EXPOSE_CONFIG -# -expose_config = False - -# Expose hostname in the web server -# -# Variable: AIRFLOW__WEBSERVER__EXPOSE_HOSTNAME -# -expose_hostname = False - -# Expose stacktrace in the web server -# -# Variable: AIRFLOW__WEBSERVER__EXPOSE_STACKTRACE -# -expose_stacktrace = False - -# Default DAG view. Valid values are: ``grid``, ``graph``, ``duration``, ``gantt``, ``landing_times`` -# -# Variable: AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW -# -dag_default_view = grid - -# Default DAG orientation. Valid values are: -# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top) -# -# Variable: AIRFLOW__WEBSERVER__DAG_ORIENTATION -# -dag_orientation = LR - -# Sorting order in grid view. Valid values are: ``topological``, ``hierarchical_alphabetical`` -# -# Variable: AIRFLOW__WEBSERVER__GRID_VIEW_SORTING_ORDER -# -grid_view_sorting_order = topological - -# The amount of time (in secs) webserver will wait for initial handshake -# while fetching logs from other worker machine -# -# Variable: AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC -# -log_fetch_timeout_sec = 5 - -# Time interval (in secs) to wait before next log fetching. -# -# Variable: AIRFLOW__WEBSERVER__LOG_FETCH_DELAY_SEC -# -log_fetch_delay_sec = 2 - -# Distance away from page bottom to enable auto tailing. 
-# -# Variable: AIRFLOW__WEBSERVER__LOG_AUTO_TAILING_OFFSET -# -log_auto_tailing_offset = 30 - -# Animation speed for auto tailing log display. -# -# Variable: AIRFLOW__WEBSERVER__LOG_ANIMATION_SPEED -# -log_animation_speed = 1000 - -# By default, the webserver shows paused DAGs. Flip this to hide paused -# DAGs by default -# -# Variable: AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT -# -hide_paused_dags_by_default = False - -# Consistent page size across all listing views in the UI -# -# Variable: AIRFLOW__WEBSERVER__PAGE_SIZE -# -page_size = 100 - -# Define the color of navigation bar -# -# Variable: AIRFLOW__WEBSERVER__NAVBAR_COLOR -# -navbar_color = #fff - -# Define the color of text in the navigation bar -# -# Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_COLOR -# -navbar_text_color = #51504f - -# Define the color of navigation bar links when hovered -# -# Variable: AIRFLOW__WEBSERVER__NAVBAR_HOVER_COLOR -# -navbar_hover_color = #eee - -# Define the color of text in the navigation bar when hovered -# -# Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_HOVER_COLOR -# -navbar_text_hover_color = #51504f - -# Define the color of the logo text -# -# Variable: AIRFLOW__WEBSERVER__NAVBAR_LOGO_TEXT_COLOR -# -navbar_logo_text_color = #51504f - -# Default dagrun to show in UI -# -# Variable: AIRFLOW__WEBSERVER__DEFAULT_DAG_RUN_DISPLAY_NUMBER -# -default_dag_run_display_number = 25 - -# Enable werkzeug ``ProxyFix`` middleware for reverse proxy -# -# Variable: AIRFLOW__WEBSERVER__ENABLE_PROXY_FIX -# -enable_proxy_fix = False - -# Number of values to trust for ``X-Forwarded-For``. -# See `Werkzeug: X-Forwarded-For Proxy Fix -# `__ for more details. -# -# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_FOR -# -proxy_fix_x_for = 1 - -# Number of values to trust for ``X-Forwarded-Proto``. -# See `Werkzeug: X-Forwarded-For Proxy Fix -# `__ for more details. -# -# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PROTO -# -proxy_fix_x_proto = 1 - -# Number of values to trust for ``X-Forwarded-Host``. 
-# See `Werkzeug: X-Forwarded-For Proxy Fix -# `__ for more details. -# -# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_HOST -# -proxy_fix_x_host = 1 - -# Number of values to trust for ``X-Forwarded-Port``. -# See `Werkzeug: X-Forwarded-For Proxy Fix -# `__ for more details. -# -# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PORT -# -proxy_fix_x_port = 1 - -# Number of values to trust for ``X-Forwarded-Prefix``. -# See `Werkzeug: X-Forwarded-For Proxy Fix -# `__ for more details. -# -# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PREFIX -# -proxy_fix_x_prefix = 1 - -# Set secure flag on session cookie -# -# Variable: AIRFLOW__WEBSERVER__COOKIE_SECURE -# -cookie_secure = False - -# Set samesite policy on session cookie -# -# Variable: AIRFLOW__WEBSERVER__COOKIE_SAMESITE -# -cookie_samesite = Lax - -# Default setting for wrap toggle on DAG code and TI log views. -# -# Variable: AIRFLOW__WEBSERVER__DEFAULT_WRAP -# -default_wrap = False - -# Allow the UI to be rendered in a frame -# -# Variable: AIRFLOW__WEBSERVER__X_FRAME_ENABLED -# -x_frame_enabled = True - -# Send anonymous user activity to your analytics tool -# choose from ``google_analytics``, ``segment``, ``metarouter``, or ``matomo`` -# -# Variable: AIRFLOW__WEBSERVER__ANALYTICS_TOOL -# -# analytics_tool = - -# Unique ID of your account in the analytics tool -# -# Variable: AIRFLOW__WEBSERVER__ANALYTICS_ID -# -# analytics_id = - -# Your instances url, only applicable to Matomo. -# -# Example: analytics_url = https://your.matomo.instance.com/ -# -# Variable: AIRFLOW__WEBSERVER__ANALYTICS_URL -# -# analytics_url = - -# 'Recent Tasks' stats will show for old DagRuns if set -# -# Variable: AIRFLOW__WEBSERVER__SHOW_RECENT_STATS_FOR_COMPLETED_RUNS -# -show_recent_stats_for_completed_runs = True - -# The UI cookie lifetime in minutes. 
User will be logged out from UI after -# ``[webserver] session_lifetime_minutes`` of non-activity -# -# Variable: AIRFLOW__WEBSERVER__SESSION_LIFETIME_MINUTES -# -session_lifetime_minutes = 43200 - -# Sets a custom page title for the DAGs overview page and site title for all pages -# -# Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME -# -# instance_name = - -# Whether the custom page title for the DAGs overview page contains any Markup language -# -# Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME_HAS_MARKUP -# -instance_name_has_markup = False - -# How frequently, in seconds, the DAG data will auto-refresh in graph or grid view -# when auto-refresh is turned on -# -# Variable: AIRFLOW__WEBSERVER__AUTO_REFRESH_INTERVAL -# -auto_refresh_interval = 3 - -# Boolean for displaying warning for publicly viewable deployment -# -# Variable: AIRFLOW__WEBSERVER__WARN_DEPLOYMENT_EXPOSURE -# -warn_deployment_exposure = True - -# Comma separated string of view events to exclude from dag audit view. -# All other events will be added minus the ones passed here. -# The audit logs in the db will not be affected by this parameter. -# -# Example: audit_view_excluded_events = cli_task_run,running,success -# -# Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_EXCLUDED_EVENTS -# -# audit_view_excluded_events = - -# Comma separated string of view events to include in dag audit view. -# If passed, only these events will populate the dag audit view. -# The audit logs in the db will not be affected by this parameter. -# -# Example: audit_view_included_events = dagrun_cleared,failed -# -# Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_INCLUDED_EVENTS -# -# audit_view_included_events = - -# Boolean for running SwaggerUI in the webserver. -# -# Variable: AIRFLOW__WEBSERVER__ENABLE_SWAGGER_UI -# -enable_swagger_ui = True - -# Boolean for running Internal API in the webserver. -# -# Variable: AIRFLOW__WEBSERVER__RUN_INTERNAL_API -# -run_internal_api = False - -# The caching algorithm used by the webserver. 
Must be a valid hashlib function name. -# -# Example: caching_hash_method = sha256 -# -# Variable: AIRFLOW__WEBSERVER__CACHING_HASH_METHOD -# -caching_hash_method = md5 - -# Behavior of the trigger DAG run button for DAGs without params. ``False`` to skip and trigger -# without displaying a form to add a **dag_run.conf**, ``True`` to always display the form. -# The form is displayed always if parameters are defined. -# -# Variable: AIRFLOW__WEBSERVER__SHOW_TRIGGER_FORM_IF_NO_PARAMS -# -show_trigger_form_if_no_params = False - -# Number of recent DAG run configurations in the selector on the trigger web form. -# -# Example: num_recent_configurations_for_trigger = 10 -# -# Variable: AIRFLOW__WEBSERVER__NUM_RECENT_CONFIGURATIONS_FOR_TRIGGER -# -num_recent_configurations_for_trigger = 5 - -# A DAG author is able to provide any raw HTML into ``doc_md`` or params description in -# ``description_md`` for text formatting. This is including potentially unsafe javascript. -# Displaying the DAG or trigger form in web UI provides the DAG author the potential to -# inject malicious code into clients browsers. To ensure the web UI is safe by default, -# raw HTML is disabled by default. If you trust your DAG authors, you can enable HTML -# support in markdown by setting this option to ``True``. -# -# This parameter also enables the deprecated fields ``description_html`` and -# ``custom_html_form`` in DAG params until the feature is removed in a future version. -# -# Example: allow_raw_html_descriptions = False -# -# Variable: AIRFLOW__WEBSERVER__ALLOW_RAW_HTML_DESCRIPTIONS -# -allow_raw_html_descriptions = False - -# The maximum size of the request payload (in MB) that can be sent. -# -# Variable: AIRFLOW__WEBSERVER__ALLOWED_PAYLOAD_SIZE -# -allowed_payload_size = 1.0 - -# Require confirmation when changing a DAG in the web UI. This is to prevent accidental changes -# to a DAG that may be running on sensitive environments like production. 
-# When set to ``True``, confirmation dialog will be shown when a user tries to Pause/Unpause, -# Trigger a DAG -# -# Variable: AIRFLOW__WEBSERVER__REQUIRE_CONFIRMATION_DAG_CHANGE -# -require_confirmation_dag_change = False - -[email] -# Configuration email backend and whether to -# send email alerts on retry or failure - -# Email backend to use -# -# Variable: AIRFLOW__EMAIL__EMAIL_BACKEND -# -email_backend = airflow.utils.email.send_email_smtp - -# Email connection to use -# -# Variable: AIRFLOW__EMAIL__EMAIL_CONN_ID -# -email_conn_id = smtp_default - -# Whether email alerts should be sent when a task is retried -# -# Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_RETRY -# -default_email_on_retry = True - -# Whether email alerts should be sent when a task failed -# -# Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_FAILURE -# -default_email_on_failure = True - -# File that will be used as the template for Email subject (which will be rendered using Jinja2). -# If not set, Airflow uses a base template. -# -# Example: subject_template = /path/to/my_subject_template_file -# -# Variable: AIRFLOW__EMAIL__SUBJECT_TEMPLATE -# -# subject_template = - -# File that will be used as the template for Email content (which will be rendered using Jinja2). -# If not set, Airflow uses a base template. -# -# Example: html_content_template = /path/to/my_html_content_template_file -# -# Variable: AIRFLOW__EMAIL__HTML_CONTENT_TEMPLATE -# -# html_content_template = - -# Email address that will be used as sender address. -# It can either be raw email or the complete address in a format ``Sender Name <sender@email.com>`` -# -# Example: from_email = Airflow <airflow@example.com> -# -# Variable: AIRFLOW__EMAIL__FROM_EMAIL -# -# from_email = - -# ssl context to use when using SMTP and IMAP SSL connections.
By default, the context is "default" -# which sets it to ``ssl.create_default_context()`` which provides the right balance between -# compatibility and security, it however requires that certificates in your operating system are -# updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public -# keys installed on your machines. You can switch it to "none" if you want to disable checking -# of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks -# if your infrastructure is not sufficiently secured. It should only be set temporarily while you -# are fixing your certificate configuration. This can be typically done by upgrading to newer -# version of the operating system you run Airflow components on,by upgrading/refreshing proper -# certificates in the OS or by updating certificates for your mail servers. -# -# Example: ssl_context = default -# -# Variable: AIRFLOW__EMAIL__SSL_CONTEXT -# -ssl_context = default - -[smtp] -# If you want airflow to send emails on retries, failure, and you want to use -# the airflow.utils.email.send_email_smtp function, you have to configure an -# smtp server here - -# Specifies the host server address used by Airflow when sending out email notifications via SMTP. -# -# Variable: AIRFLOW__SMTP__SMTP_HOST -# -smtp_host = localhost - -# Determines whether to use the STARTTLS command when connecting to the SMTP server. -# -# Variable: AIRFLOW__SMTP__SMTP_STARTTLS -# -smtp_starttls = True - -# Determines whether to use an SSL connection when talking to the SMTP server. -# -# Variable: AIRFLOW__SMTP__SMTP_SSL -# -smtp_ssl = False - -# Username to authenticate when connecting to smtp server. -# -# Example: smtp_user = airflow -# -# Variable: AIRFLOW__SMTP__SMTP_USER -# -# smtp_user = - -# Password to authenticate when connecting to smtp server. 
-# -# Example: smtp_password = airflow -# -# Variable: AIRFLOW__SMTP__SMTP_PASSWORD -# -# smtp_password = - -# Defines the port number on which Airflow connects to the SMTP server to send email notifications. -# -# Variable: AIRFLOW__SMTP__SMTP_PORT -# -smtp_port = 25 - -# Specifies the default **from** email address used when Airflow sends email notifications. -# -# Variable: AIRFLOW__SMTP__SMTP_MAIL_FROM -# -smtp_mail_from = airflow@example.com - -# Determines the maximum time (in seconds) the Apache Airflow system will wait for a -# connection to the SMTP server to be established. -# -# Variable: AIRFLOW__SMTP__SMTP_TIMEOUT -# -smtp_timeout = 30 - -# Defines the maximum number of times Airflow will attempt to connect to the SMTP server. -# -# Variable: AIRFLOW__SMTP__SMTP_RETRY_LIMIT -# -smtp_retry_limit = 5 - -[sentry] -# `Sentry <https://docs.sentry.io>`__ integration. Here you can supply -# additional configuration options based on the Python platform. -# See `Python / Configuration / Basic Options -# <https://docs.sentry.io/platforms/python/configuration/options/>`__ for more details. -# Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, -# ``ignore_errors``, ``before_breadcrumb``, ``transport``. - -# Enable error reporting to Sentry -# -# Variable: AIRFLOW__SENTRY__SENTRY_ON -# -sentry_on = false - -# -# Variable: AIRFLOW__SENTRY__SENTRY_DSN -# -sentry_dsn = - -# Dotted path to a before_send function that the sentry SDK should be configured to use. -# -# Variable: AIRFLOW__SENTRY__BEFORE_SEND -# -# before_send = - -[scheduler] -# Task instances listen for external kill signal (when you clear tasks -# from the CLI or the UI), this defines the frequency at which they should -# listen (in seconds). -# -# Variable: AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC -# -job_heartbeat_sec = 5 - -# The scheduler constantly tries to trigger new tasks (look at the -# scheduler section in the docs for more information). This defines -# how often the scheduler should run (in seconds).
-# -# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC -# -scheduler_heartbeat_sec = 5 - -# The frequency (in seconds) at which the LocalTaskJob should send heartbeat signals to the -# scheduler to notify it's still alive. If this value is set to 0, the heartbeat interval will default -# to the value of ``[scheduler] scheduler_zombie_task_threshold``. -# -# Variable: AIRFLOW__SCHEDULER__LOCAL_TASK_JOB_HEARTBEAT_SEC -# -local_task_job_heartbeat_sec = 0 - -# The number of times to try to schedule each DAG file -# -1 indicates unlimited number -# -# Variable: AIRFLOW__SCHEDULER__NUM_RUNS -# -num_runs = -1 - -# Controls how long the scheduler will sleep between loops, but if there was nothing to do -# in the loop. i.e. if it scheduled something then it will start the next loop -# iteration straight away. -# -# Variable: AIRFLOW__SCHEDULER__SCHEDULER_IDLE_SLEEP_TIME -# -scheduler_idle_sleep_time = 1 - -# Number of seconds after which a DAG file is parsed. The DAG file is parsed every -# ``[scheduler] min_file_process_interval`` number of seconds. Updates to DAGs are reflected after -# this interval. Keeping this number low will increase CPU usage. -# -# Variable: AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL -# -min_file_process_interval = 30 - -# How often (in seconds) to check for stale DAGs (DAGs which are no longer present in -# the expected files) which should be deactivated, as well as datasets that are no longer -# referenced and should be marked as orphaned. -# -# Variable: AIRFLOW__SCHEDULER__PARSING_CLEANUP_INTERVAL -# -parsing_cleanup_interval = 60 - -# How long (in seconds) to wait after we have re-parsed a DAG file before deactivating stale -# DAGs (DAGs which are no longer present in the expected files). The reason why we need -# this threshold is to account for the time between when the file is parsed and when the -# DAG is loaded. 
The absolute maximum that this could take is ``[core] dag_file_processor_timeout``, -# but when you have a long timeout configured, it results in a significant delay in the -# deactivation of stale dags. -# -# Variable: AIRFLOW__SCHEDULER__STALE_DAG_THRESHOLD -# -stale_dag_threshold = 50 - -# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. -# -# Variable: AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL -# -dag_dir_list_interval = 300 - -# How often should stats be printed to the logs. Setting to 0 will disable printing stats -# -# Variable: AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL -# -print_stats_interval = 30 - -# How often (in seconds) should pool usage stats be sent to StatsD (if statsd_on is enabled) -# -# Variable: AIRFLOW__SCHEDULER__POOL_METRICS_INTERVAL -# -pool_metrics_interval = 5.0 - -# If the last scheduler heartbeat happened more than ``[scheduler] scheduler_health_check_threshold`` -# ago (in seconds), scheduler is considered unhealthy. -# This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI -# for SchedulerJob. 
-# -# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_THRESHOLD -# -scheduler_health_check_threshold = 30 - -# When you start a scheduler, airflow starts a tiny web server -# subprocess to serve a health check if this is set to ``True`` -# -# Variable: AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK -# -enable_health_check = False - -# When you start a scheduler, airflow starts a tiny web server -# subprocess to serve a health check on this host -# -# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_HOST -# -scheduler_health_check_server_host = 0.0.0.0 - -# When you start a scheduler, airflow starts a tiny web server -# subprocess to serve a health check on this port -# -# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_PORT -# -scheduler_health_check_server_port = 8974 - -# How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs -# -# Variable: AIRFLOW__SCHEDULER__ORPHANED_TASKS_CHECK_INTERVAL -# -orphaned_tasks_check_interval = 300.0 - -# Determines the directory where logs for the child processes of the scheduler will be stored -# -# Variable: AIRFLOW__SCHEDULER__CHILD_PROCESS_LOG_DIRECTORY -# -child_process_log_directory = /shared_dir/airflow/logs/scheduler - -# Local task jobs periodically heartbeat to the DB. If the job has -# not heartbeat in this many seconds, the scheduler will mark the -# associated task instance as failed and will re-schedule the task. -# -# Variable: AIRFLOW__SCHEDULER__SCHEDULER_ZOMBIE_TASK_THRESHOLD -# -scheduler_zombie_task_threshold = 300 - -# How often (in seconds) should the scheduler check for zombie tasks. -# -# Variable: AIRFLOW__SCHEDULER__ZOMBIE_DETECTION_INTERVAL -# -zombie_detection_interval = 10.0 - -# Turn off scheduler catchup by setting this to ``False``. 
-# Default behavior is unchanged and -# Command Line Backfills still work, but the scheduler -# will not do scheduler catchup if this is ``False``, -# however it can be set on a per DAG basis in the -# DAG definition (catchup) -# -# Variable: AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT -# -catchup_by_default = True - -# Setting this to ``True`` will make first task instance of a task -# ignore depends_on_past setting. A task instance will be considered -# as the first task instance of a task when there is no task instance -# in the DB with an execution_date earlier than it., i.e. no manual marking -# success will be needed for a newly added task to be scheduled. -# -# Variable: AIRFLOW__SCHEDULER__IGNORE_FIRST_DEPENDS_ON_PAST_BY_DEFAULT -# -ignore_first_depends_on_past_by_default = True - -# This changes the batch size of queries in the scheduling main loop. -# This should not be greater than ``[core] parallelism``. -# If this is too high, SQL query performance may be impacted by -# complexity of query predicate, and/or excessive locking. -# Additionally, you may hit the maximum allowable query length for your db. -# Set this to 0 to use the value of ``[core] parallelism`` -# -# Variable: AIRFLOW__SCHEDULER__MAX_TIS_PER_QUERY -# -max_tis_per_query = 16 - -# Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries. -# If this is set to ``False`` then you should not run more than a single -# scheduler at once -# -# Variable: AIRFLOW__SCHEDULER__USE_ROW_LEVEL_LOCKING -# -use_row_level_locking = True - -# Max number of DAGs to create DagRuns for per scheduler loop. -# -# Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_TO_CREATE_PER_LOOP -# -max_dagruns_to_create_per_loop = 10 - -# How many DagRuns should a scheduler examine (and lock) when scheduling -# and queuing tasks. 
-# -# Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_PER_LOOP_TO_SCHEDULE -# -max_dagruns_per_loop_to_schedule = 20 - -# Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the -# same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other -# dags in some circumstances -# -# Variable: AIRFLOW__SCHEDULER__SCHEDULE_AFTER_TASK_EXECUTION -# -schedule_after_task_execution = True - -# The scheduler reads dag files to extract the airflow modules that are going to be used, -# and imports them ahead of time to avoid having to re-do it for each parsing process. -# This flag can be set to ``False`` to disable this behavior in case an airflow module needs -# to be freshly imported each time (at the cost of increased DAG parsing time). -# -# Variable: AIRFLOW__SCHEDULER__PARSING_PRE_IMPORT_MODULES -# -parsing_pre_import_modules = True - -# The scheduler can run multiple processes in parallel to parse dags. -# This defines how many processes will run. -# -# Variable: AIRFLOW__SCHEDULER__PARSING_PROCESSES -# -parsing_processes = 2 - -# One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``. -# The scheduler will list and sort the dag files to decide the parsing order. -# -# * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the -# recently modified DAGs first. -# * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the -# same host. This is useful when running with Scheduler in HA mode where each scheduler can -# parse different DAG files. -# * ``alphabetical``: Sort by filename -# -# Variable: AIRFLOW__SCHEDULER__FILE_PARSING_SORT_MODE -# -file_parsing_sort_mode = modified_time - -# Whether the dag processor is running as a standalone process or it is a subprocess of a scheduler -# job. 
-# -# Variable: AIRFLOW__SCHEDULER__STANDALONE_DAG_PROCESSOR -# -standalone_dag_processor = False - -# Only applicable if ``[scheduler] standalone_dag_processor`` is true and callbacks are stored -# in database. Contains maximum number of callbacks that are fetched during a single loop. -# -# Variable: AIRFLOW__SCHEDULER__MAX_CALLBACKS_PER_LOOP -# -max_callbacks_per_loop = 20 - -# Only applicable if ``[scheduler] standalone_dag_processor`` is true. -# Time in seconds after which dags, which were not updated by Dag Processor are deactivated. -# -# Variable: AIRFLOW__SCHEDULER__DAG_STALE_NOT_SEEN_DURATION -# -dag_stale_not_seen_duration = 600 - -# Turn off scheduler use of cron intervals by setting this to ``False``. -# DAGs submitted manually in the web UI or with trigger_dag will still run. -# -# Variable: AIRFLOW__SCHEDULER__USE_JOB_SCHEDULE -# -use_job_schedule = True - -# Allow externally triggered DagRuns for Execution Dates in the future -# Only has effect if schedule_interval is set to None in DAG -# -# Variable: AIRFLOW__SCHEDULER__ALLOW_TRIGGER_IN_FUTURE -# -allow_trigger_in_future = False - -# How often to check for expired trigger requests that have not run yet. -# -# Variable: AIRFLOW__SCHEDULER__TRIGGER_TIMEOUT_CHECK_INTERVAL -# -trigger_timeout_check_interval = 15 - -# Amount of time a task can be in the queued state before being retried or set to failed. -# -# Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT -# -task_queued_timeout = 600.0 - -# How often to check for tasks that have been in the queued state for -# longer than ``[scheduler] task_queued_timeout``. -# -# Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT_CHECK_INTERVAL -# -task_queued_timeout_check_interval = 120.0 - -# The run_id pattern used to verify the validity of user input to the run_id parameter when -# triggering a DAG. 
This pattern cannot change the pattern used by scheduler to generate run_id -# for scheduled DAG runs or DAG runs triggered without changing the run_id parameter. -# -# Variable: AIRFLOW__SCHEDULER__ALLOWED_RUN_ID_PATTERN -# -allowed_run_id_pattern = ^[A-Za-z0-9_.~:+-]+$ - -# Whether to create DAG runs that span an interval or one single point in time for cron schedules, when -# a cron string is provided to ``schedule`` argument of a DAG. -# -# * ``True``: **CronDataIntervalTimetable** is used, which is suitable -# for DAGs with well-defined data interval. You get contiguous intervals from the end of the previous -# interval up to the scheduled datetime. -# * ``False``: **CronTriggerTimetable** is used, which is closer to the behavior of cron itself. -# -# Notably, for **CronTriggerTimetable**, the logical date is the same as the time the DAG Run will -# try to schedule, while for **CronDataIntervalTimetable**, the logical date is the beginning of -# the data interval, but the DAG Run will try to schedule at the end of the data interval. -# -# Variable: AIRFLOW__SCHEDULER__CREATE_CRON_DATA_INTERVALS -# -create_cron_data_intervals = True - -[triggerer] -# How many triggers a single Triggerer will run at once, by default. -# -# Variable: AIRFLOW__TRIGGERER__DEFAULT_CAPACITY -# -default_capacity = 1000 - -# How often to heartbeat the Triggerer job to ensure it hasn't been killed. -# -# Variable: AIRFLOW__TRIGGERER__JOB_HEARTBEAT_SEC -# -job_heartbeat_sec = 5 - -# If the last triggerer heartbeat happened more than ``[triggerer] triggerer_health_check_threshold`` -# ago (in seconds), triggerer is considered unhealthy. -# This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI -# for TriggererJob. -# -# Variable: AIRFLOW__TRIGGERER__TRIGGERER_HEALTH_CHECK_THRESHOLD -# -triggerer_health_check_threshold = 30 - -[kerberos] -# Location of your ccache file once kinit has been performed. 
-# -# Variable: AIRFLOW__KERBEROS__CCACHE -# -ccache = /tmp/airflow_krb5_ccache - -# gets augmented with fqdn -# -# Variable: AIRFLOW__KERBEROS__PRINCIPAL -# -principal = airflow - -# Determines the frequency at which initialization or re-initialization processes occur. -# -# Variable: AIRFLOW__KERBEROS__REINIT_FREQUENCY -# -reinit_frequency = 3600 - -# Path to the kinit executable -# -# Variable: AIRFLOW__KERBEROS__KINIT_PATH -# -kinit_path = kinit - -# Designates the path to the Kerberos keytab file for the Airflow user -# -# Variable: AIRFLOW__KERBEROS__KEYTAB -# -keytab = airflow.keytab - -# Allow to disable ticket forwardability. -# -# Variable: AIRFLOW__KERBEROS__FORWARDABLE -# -forwardable = True - -# Allow to remove source IP from token, useful when using token behind NATted Docker host. -# -# Variable: AIRFLOW__KERBEROS__INCLUDE_IP -# -include_ip = True - -[sensors] -# Sensor default timeout, 7 days by default (7 * 24 * 60 * 60). -# -# Variable: AIRFLOW__SENSORS__DEFAULT_TIMEOUT -# -default_timeout = 604800 - -[usage_data_collection] -# Airflow integrates `Scarf `__ to collect basic platform and usage data -# during operation. This data assists Airflow maintainers in better understanding how Airflow is used. -# Insights gained from this telemetry are critical for prioritizing patches, minor releases, and -# security fixes. Additionally, this information supports key decisions related to the development road map. -# Check the FAQ doc for more information on what data is collected. -# -# Deployments can opt-out of analytics by setting the ``enabled`` option -# to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. -# Individual users can easily opt-out of analytics in various ways documented in the -# `Scarf Do Not Track docs `__. - -# Enable or disable usage data collection and sending. 
-# -# Variable: AIRFLOW__USAGE_DATA_COLLECTION__ENABLED -# -enabled = True - -[common.io] -# Common IO configuration section - -# Path to a location on object storage where XComs can be stored in url format. -# -# Example: xcom_objectstorage_path = s3://conn_id@bucket/path -# -# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_PATH -# -xcom_objectstorage_path = - -# Threshold in bytes for storing XComs in object storage. -1 means always store in the -# database. 0 means always store in object storage. Any positive number means -# it will be stored in object storage if the size of the value is greater than the threshold. -# -# Example: xcom_objectstorage_threshold = 1000000 -# -# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_THRESHOLD -# -xcom_objectstorage_threshold = -1 - -# Compression algorithm to use when storing XComs in object storage. Supported algorithms -# are a.o.: snappy, zip, gzip, bz2, and lzma. If not specified, no compression will be used. -# Note that the compression algorithm must be available in the Python installation (e.g. -# python-snappy for snappy). Zip, gz, bz2 are available by default. -# -# Example: xcom_objectstorage_compression = gz -# -# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_COMPRESSION -# -xcom_objectstorage_compression = - -[fab] -# This section contains configs specific to FAB provider. - -# Boolean for enabling rate limiting on authentication endpoints. -# -# Variable: AIRFLOW__FAB__AUTH_RATE_LIMITED -# -auth_rate_limited = True - -# Rate limit for authentication endpoints. -# -# Variable: AIRFLOW__FAB__AUTH_RATE_LIMIT -# -auth_rate_limit = 5 per 40 second - -# Update FAB permissions and sync security manager roles -# on webserver startup -# -# Variable: AIRFLOW__FAB__UPDATE_FAB_PERMS -# -update_fab_perms = True - -[imap] -# Options for IMAP provider. - -# ssl_context = - -[smtp_provider] -# Options for SMTP provider. - -# ssl context to use when using SMTP and IMAP SSL connections. 
By default, the context is "default" -# which sets it to ``ssl.create_default_context()`` which provides the right balance between -# compatibility and security, it however requires that certificates in your operating system are -# updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public -# keys installed on your machines. You can switch it to "none" if you want to disable checking -# of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks -# if your infrastructure is not sufficiently secured. It should only be set temporarily while you -# are fixing your certificate configuration. This can be typically done by upgrading to newer -# version of the operating system you run Airflow components on,by upgrading/refreshing proper -# certificates in the OS or by updating certificates for your mail servers. -# -# If you do not set this option explicitly, it will use Airflow "email.ssl_context" configuration, -# but if this configuration is not present, it will use "default" value. -# -# Example: ssl_context = default -# -# Variable: AIRFLOW__SMTP_PROVIDER__SSL_CONTEXT -# -# ssl_context = - -# Allows overriding of the standard templated email subject line when the SmtpNotifier is used. -# Must provide a path to the template. -# -# Example: templated_email_subject_path = path/to/override/email_subject.html -# -# Variable: AIRFLOW__SMTP_PROVIDER__TEMPLATED_EMAIL_SUBJECT_PATH -# -# templated_email_subject_path = - -# Allows overriding of the standard templated email path when the SmtpNotifier is used. Must provide -# a path to the template. 
-# -# Example: templated_html_content_path = path/to/override/email.html -# -# Variable: AIRFLOW__SMTP_PROVIDER__TEMPLATED_HTML_CONTENT_PATH -# -# templated_html_content_path = - diff --git a/env/alert_users.yaml b/env/alert_users.yaml new file mode 100644 index 0000000..22859bd --- /dev/null +++ b/env/alert_users.yaml @@ -0,0 +1,20 @@ +groups: + pipeline_dev: + description: Pipeline developers team + emails: + - riccardo.falco@inaf.it + - admin@localhost + # - dev2@cosi.it + + qa_team: + description: Quality Assurance + emails: + - riccardo.falco@inaf.it + # - qa@cosi.it + +rules: + - pattern: "ALERT_FAIL" + notify: ["pipeline_dev"] + + - pattern: "TriggerDagRunOperator" + notify: ["qa_team"] diff --git a/env/docker-compose.yaml b/env/docker-compose.yaml index 542c4ce..4686dab 100644 --- a/env/docker-compose.yaml +++ b/env/docker-compose.yaml @@ -1,35 +1,216 @@ +# Docker Compose file version (obsolete in newer versions, but kept for compatibility) +# version: "3.9" + +# Build arguments for Dockerfile (UID and GID) +x-build-args: &build-args + UID: ${UID:-} # TOEDIT: set the UID of the user in the container (default: 501) + GID: ${GID:-} # TOEDIT: set the GID of the user in the container (default: 20) + +x-common-env: &common-env + #-------- BOOTSTRAP ID user for container (can be overridden via .env) + DISPLAY: ${DISPLAY:-} + #-------- AIRFLOW Environment Variables + AIRFLOW_ADMIN_USERNAME: ${AIRFLOW_ADMIN_USERNAME:-admin} + AIRFLOW_ADMIN_EMAIL: ${AIRFLOW_ADMIN_EMAIL:-admin@localhost} + # Write here the secure password for airflow Web UI + AIRFLOW_ADMIN_PASSWORD: ${AIRFLOW_ADMIN_PASSWORD:-} # TOEDIT: set the password for the Airflow Web UI (default: cosipass) + #-------- Airflow home directory + AIRFLOW_HOME: /home/gamma/airflow + #-------- SMTP Settings for MailHog + # HOST_IP: Define once here, use everywhere. 
Can be overridden via .env file + # URLs are constructed dynamically in entrypoint-airflow.sh from HOST_IP and ports below + HOST_IP: ${HOST_IP:-merlin.iasfbo.inaf.it} # TOEDIT: set the IP address of the host (default: merlin.iasfbo.inaf.it) + # Ports for services (can be overridden via .env) + MAILHOG_WEBUI_PORT: ${MAILHOG_WEBUI_PORT:-8025} # TOEDIT: set the port for the MailHog Web UI (default: 8025) + AIRFLOW_WEBUI_PORT: ${AIRFLOW_WEBUI_PORT:-8080} # TOEDIT: set the port for the Airflow Web UI (default: 8080) + # URLs will be constructed in entrypoint-airflow.sh as: + # MAILHOG_WEBUI_URL=http://${HOST_IP}:${MAILHOG_WEBUI_PORT} + # COSIFLOW_HOME_URL=http://${HOST_IP}:${AIRFLOW_WEBUI_PORT}/heasarcbrowser +################################### +###### POSTGRES CONFIGURATION ###### +################################### +#-------- Username for Airflow DB + POSTGRES_USER: ${POSTGRES_USER:-airflow_user} +#-------- Database name for Airflow + POSTGRES_DB: ${POSTGRES_DB:-airflow_db} +#-------- Password for Airflow DB + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-secure_password} # Password for Airflow DB +################################### +###### MAILHOG CONFIGURATION ###### +################################### + #-------- AIRFLOW Alert Settings + ALERT_USERS_LIST_PATH: ${ALERT_USERS_LIST_PATH:-/home/gamma/env/alert_users.yaml} + #-------- SMTP Settings + ALERT_SMTP_SERVER: ${ALERT_SMTP_SERVER:-mailhog} + ALERT_EMAIL_SENDER: ${ALERT_EMAIL_SENDER:-donotreply@cosiflow.alert.errors.it} + ALERT_LOG_PATH: ${ALERT_LOG_PATH:-/home/gamma/workspace/log/data_pipeline.log} + #-------- SMTP Settings for Airflow + AIRFLOW__SMTP__SMTP_STARTTLS: ${AIRFLOW__SMTP__SMTP_STARTTLS:-False} + AIRFLOW__SMTP__SMTP_SSL: ${AIRFLOW__SMTP__SMTP_SSL:-False} +######################################## +###### DOCKER PROXY CONFIGURATION ###### +######################################## + #-------- Docker Socket Proxy + # Note: These are used in docker-proxy service. Defaults are applied inline. 
+ DOCKER_PROXY_PORT: ${DOCKER_PROXY_PORT:-2375} + DOCKER_HOST: tcp://docker-proxy:${DOCKER_PROXY_PORT:-2375} + DOCKER_CONTAINERS: ${DOCKER_CONTAINERS:-1} # Allow listing/starting containers + DOCKER_IMAGES: ${DOCKER_IMAGES:-1} # Allow image pulling + DOCKER_POST: ${DOCKER_POST:-1} # Allow POST requests (create/start/stop) + DOCKER_NETWORKS: ${DOCKER_NETWORKS:-1} # Allow network operations + DOCKER_VOLUMES: ${DOCKER_VOLUMES:-1} # Allow volume operations +###################################### +###### COSI Directory Structure ###### +###################################### + #-------- Base directories + COSI_DATA_DIR: ${COSI_DATA_DIR:-/home/gamma/workspace/data} + COSI_INPUT_DIR: ${COSI_INPUT_DIR:-/home/gamma/workspace/data/input} + COSI_LOG_DIR: ${COSI_LOG_DIR:-/home/gamma/workspace/log} + #-------- Main data type directories + COSI_OBS_DIR: ${COSI_OBS_DIR:-/home/gamma/workspace/data/obs} + COSI_TRANSIENT_DIR: ${COSI_TRANSIENT_DIR:-/home/gamma/workspace/data/transient} + COSI_TRIGGER_DIR: ${COSI_TRIGGER_DIR:-/home/gamma/workspace/data/trigger} + COSI_MAPS_DIR: ${COSI_MAPS_DIR:-/home/gamma/workspace/data/maps} + COSI_SOURCE_DIR: ${COSI_SOURCE_DIR:-/home/gamma/workspace/data/source} services: + # --------------------------------------------------------------------------- + # POSTGRES DATABASE SERVICE + # --------------------------------------------------------------------------- postgres: - image: postgres + image: postgres:15 container_name: cosi_postgres + # The default Postgres image runs as user "postgres". + # You usually should NOT override this with your local UID/GID. + # user: "${UID}:${GID}" + environment: - - POSTGRES_USER=airflow_user - - POSTGRES_PASSWORD=secure_password - - POSTGRES_DB=airflow_db - #volumes: - # - ${HOME}/postgres_data:/var/lib/postgresql/data - #restart: always + # These values initialize the PostgreSQL database cluster. + # The first initialization only happens if the data volume is empty. 
+ # Values are interpolated inline below, with defaults applied when a variable is unset (e.g. no .env file) + - UID=${UID:-501} + - GID=${GID:-20} + - POSTGRES_USER=${POSTGRES_USER:-airflow_user} + - POSTGRES_DB=${POSTGRES_DB:-airflow_db} + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-secure_password} + + # Health check to verify that PostgreSQL is accepting connections + healthcheck: + test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 5432 -U $${POSTGRES_USER} -d $${POSTGRES_DB}"] + interval: 5s # Run every 5 seconds + timeout: 5s # Fail if command takes longer than 5 seconds + retries: 10 # Mark container as unhealthy after 10 failed checks + + volumes: + # Local directory that persists the database files + # Commented line for a user-level data directory: + # - ${HOME}/postgres_data:/var/lib/postgresql/data + - ../data/postgres_data:/var/lib/postgresql/data + # restart: always # Uncomment for auto-restart on failure + + # --------------------------------------------------------------------------- + # AIRFLOW SERVICE + # --------------------------------------------------------------------------- airflow: - image: airflow:1.1.0 + image: cosiflow-airflow:native + build: + context: . # Build context for Dockerfile + dockerfile: Dockerfile.airflow + args: + <<: *build-args # Use UID and GID from x-build-args anchor container_name: cosi_airflow + + # Run the container as the "gamma" user created inside the image. + # This matches the user configured in your Dockerfile. + user: "gamma" + environment: - - AIRFLOW_HOME=/home/gamma/airflow - - DISPLAY=${DISPLAY} - - AIRFLOW__CORE__LOAD_EXAMPLES=False + <<: *common-env + # DISPLAY is included in common-env, so it is not listed here separately + # All variables from x-common-env are inherited via the anchor + volumes: - - ../dags:/home/gamma//airflow/dags - - ./airflow.cfg.postgresql:/home/gamma/airflow/airflow.cfg + # DAGs, plugins, and pipeline directories are mounted so you can edit + # them live without rebuilding the image. 
+ - ../dags:/home/gamma/airflow/dags + + # [Shared Storage] Pool of all COSI modules (mounted from workspace root) + # This allows hot-plugging new modules via symlinks without restarting. + - ../../:/home/gamma/airflow/modules_pool + + # [Docker] Socket mount removed -> We use TCP proxy now. + - ../plugins:/home/gamma/airflow/plugins + - ../pipeline:/home/gamma/airflow/pipeline + - ../callbacks:/home/gamma/airflow/callbacks + - ../modules:/home/gamma/airflow/modules + + # Mount the PostgreSQL-specific Airflow configuration + - ./airflow.cfg:/home/gamma/airflow/airflow.cfg + + # X11 socket for graphical display forwarding (if needed) - /tmp/.X11-unix:/tmp/.X11-unix:rw - - ${HOME}/cosiflow:/shared_dir + + # Shared directory between host and container + - ..:/shared_dir + + # Data workspace directory for processing large files + - ../data:/home/gamma/workspace/data + + ports: + - "8080:8080" # Airflow Web UI + - "28888:28888" # Jupyter Notebook (if enabled) + + depends_on: + postgres: + # Wait until the Postgres healthcheck passes before starting Airflow + condition: service_healthy + + # restart: always # Uncomment if you want automatic restarts + + # Custom entrypoint script that initializes Airflow DB and starts services + entrypoint: ["bash", "/home/gamma/entrypoint-airflow.sh"] + # Alternative entrypoint for debugging: + # entrypoint: ["tail", "-f", "/dev/null"] + + # --------------------------------------------------------------------------- + # DOCKER SOCKET PROXY (Sidecar for secure Docker access) + # --------------------------------------------------------------------------- + docker-proxy: + image: tecnativa/docker-socket-proxy:latest + container_name: cosi_docker_proxy + # platform: linux/amd64 # Uncomment if needed on ARM, but usually multi-arch + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro # Read-only mount for security! 
+ environment: + # Use values from x-common-env anchor with defaults + - CONTAINERS=${DOCKER_CONTAINERS:-1} # Allow listing/starting containers + - IMAGES=${DOCKER_IMAGES:-1} # Allow image pulling + - POST=${DOCKER_POST:-1} # Allow POST requests (create/start/stop) + - NETWORKS=${DOCKER_NETWORKS:-1} # Allow network operations + - VOLUMES=${DOCKER_VOLUMES:-1} # Allow volume operations + ports: + # Expose Docker proxy port (default: 2375) + # Note: Using hardcoded port 2375 for compatibility. + # The proxy always listens on port 2375 inside the container. + # To change the host port, set DOCKER_PROXY_PORT env var and update this mapping. + - "127.0.0.1:2375:2375" # Expose only to localhost for security + # No 'user' set, it runs as root internally to access the socket, but acts as a proxy. + + # --------------------------------------------------------------------------- + # MAILHOG SERVICE (Fake SMTP server for local testing) + # --------------------------------------------------------------------------- + mailhog: + image: mailhog/mailhog + platform: linux/amd64 # Explicitly use AMD64 emulation on ARM Macs to silence warning + container_name: cosi_mailhog ports: - - "8080:8080" - - "28888:28888" #jupyter notebook - #restart: always - entrypoint: ["bash", "/home/gamma/entrypoint.sh"] - #entrypoint: ["tail", "-f", "/dev/null"] + - "1025:1025" # SMTP port for Airflow alerts + - "8025:8025" # Web UI → http://localhost:8025 (or use HOST_IP from .env) +# --------------------------------------------------------------------------- +# NAMED VOLUMES (Optional - not used directly since host paths are mounted) +# --------------------------------------------------------------------------- volumes: postgres_data: diff --git a/env/entrypoint-airflow.sh b/env/entrypoint-airflow.sh new file mode 100644 index 0000000..d3b48f8 --- /dev/null +++ b/env/entrypoint-airflow.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#set -euo pipefail + +cd /home/gamma + +if [ -n "${ALERT_EMAIL_SENDER:-}" ]; then + 
export AIRFLOW__SMTP__SMTP_MAIL_FROM="$ALERT_EMAIL_SENDER" +fi + +# Always use this email backend +export AIRFLOW__EMAIL__EMAIL_BACKEND=airflow.utils.email.send_email_smtp + +# Construct URLs from HOST_IP and ports (defined once in docker-compose.yaml) +# This allows changing HOST_IP in one place and having all URLs update automatically +HOST_IP="${HOST_IP:-localhost}" +MAILHOG_WEBUI_PORT="${MAILHOG_WEBUI_PORT:-8025}" +AIRFLOW_WEBUI_PORT="${AIRFLOW_WEBUI_PORT:-8080}" + +# Build URLs dynamically +export MAILHOG_WEBUI_URL="http://${HOST_IP}:${MAILHOG_WEBUI_PORT}" +export COSIFLOW_HOME_URL="http://${HOST_IP}:${AIRFLOW_WEBUI_PORT}/heasarcbrowser" + +echo "🌐 URLs configured:" +echo " MAILHOG_WEBUI_URL=${MAILHOG_WEBUI_URL}" +echo " COSIFLOW_HOME_URL=${COSIFLOW_HOME_URL}" + +# Export COSI directory structure environment variables if present +if [ -n "${COSI_DATA_DIR:-}" ]; then + export COSI_DATA_DIR="$COSI_DATA_DIR" +fi + +if [ -n "${COSI_OBS_DIR:-}" ]; then + export COSI_OBS_DIR="$COSI_OBS_DIR" +fi + +if [ -n "${COSI_TRANSIENT_DIR:-}" ]; then + export COSI_TRANSIENT_DIR="$COSI_TRANSIENT_DIR" +fi + +if [ -n "${COSI_TRIGGER_DIR:-}" ]; then + export COSI_TRIGGER_DIR="$COSI_TRIGGER_DIR" +fi + +if [ -n "${COSI_MAPS_DIR:-}" ]; then + export COSI_MAPS_DIR="$COSI_MAPS_DIR" +fi + +if [ -n "${COSI_SOURCE_DIR:-}" ]; then + export COSI_SOURCE_DIR="$COSI_SOURCE_DIR" +fi + +if [ -n "${COSI_INPUT_DIR:-}" ]; then + export COSI_INPUT_DIR="$COSI_INPUT_DIR" +fi + +if [ -n "${COSI_LOG_DIR:-}" ]; then + export COSI_LOG_DIR="$COSI_LOG_DIR" +fi + +# Create COSI directory structure if not present +mkdir -p $COSI_DATA_DIR/{obs,transient,trigger,maps,source} + +# Activate Python venv +if [ -f "/home/gamma/venv/bin/activate" ]; then + source /home/gamma/venv/bin/activate + echo "✅ Virtual environment activated." +else + echo "⚠️ venv activate script not found, assuming PATH is correct." 
+fi +# export PATH="$PATH:~/.local/bin" # Not needed with venv in PATH + +# Initialize Airflow DB +airflow db init + +# Create admin user if not present +if ! airflow users list | grep -q "$AIRFLOW_ADMIN_USERNAME"; then + airflow users create \ + --username "$AIRFLOW_ADMIN_USERNAME" \ + --firstname COSI \ + --lastname Admin \ + --role Admin \ + --email "$AIRFLOW_ADMIN_EMAIL" \ + --password "$AIRFLOW_ADMIN_PASSWORD" + echo "✅ Admin user created." +else + echo "ℹ️ Admin user already exists. Skipping creation." +fi + +# Start webserver (in background) and scheduler +airflow webserver --port 8080 & +airflow scheduler diff --git a/env/entrypoint.sh b/env/entrypoint.sh deleted file mode 100644 index 63ffe6e..0000000 --- a/env/entrypoint.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -cd /home/gamma -source activate gamma -export PATH="$PATH:~/.local/bin" -echo $PATH -airflow standalone diff --git a/env/environment.yml b/env/environment.yml index 070c95c..de1971b 100644 --- a/env/environment.yml +++ b/env/environment.yml @@ -6,6 +6,6 @@ dependencies: - root=6.26 - root_base=6.26 - pip - - python=3.10 # Specifica la versione di Python se necessario + - python=3.10 # Specify the Python version if needed - pip: - cosipy diff --git a/env/hot_load_module.sh b/env/hot_load_module.sh new file mode 100755 index 0000000..0db2393 --- /dev/null +++ b/env/hot_load_module.sh @@ -0,0 +1,1041 @@ +#!/bin/bash +# hot_load_module.sh +# Usage: +# $0 MODULE_NAME [install|remove|update] -d [dags] -p [pipeline] -f [images] +# +# Options (paths are relative to module root unless absolute): +# -d path to DAGs directory (default: src/dags) +# -p path to pipeline directory (default: src/pipeline) +# -f path to Docker context directory containing Dockerfile (default: env) +# -e create Python virtual environment(s) in container (default: false) +# -r path to requirements.txt file (default: env/requirements.txt, legacy mode) +# -E comma-separated list of environments to create from module_envs.yaml (e.g., 
env1,env2 or "all") +# -a create Python environment AND build Docker image (equivalent to -e with Docker build) +# -c path to config file (default: auto-detect in module) + +CONTAINER_USER="gamma" +CONTAINER_NAME="cosi_airflow" +EXTENSION_MODULE=".cfmodule" + +# Resolve absolute paths to avoid confusion depending on where script is run from +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +WORKSPACE_ROOT="$(dirname "$(dirname "$SCRIPT_DIR")")" # Go up two levels: cosiflow/env -> cosiflow -> workspace + +# Default paths (relative to module root) +PATH_DAGS="src/dags" +PATH_PIPELINE="src/pipeline" +PATH_IMAGES="env" +CREATE_ENV=false +BUILD_DOCKER=false +PATH_REQUIREMENTS="env/requirements.txt" +VENV_PATH="/home/gamma/envs/cosipy" +ENV_SELECTION="" # Empty = use YAML enabled, "all" = all, or comma-separated list +CONFIG_FILE="" # Optional path to config file (overrides auto-detect when set) + +# Helper to run docker exec as airflow user +dexec() { + docker exec -u $CONTAINER_USER $CONTAINER_NAME "$@" +} + +# Function to parse YAML and extract top-level configuration values +parse_yaml_config() { + local yaml_file="$1" + local field="$2" # install_mode, or paths subfields: dags, pipeline, images + + if [ ! 
-f "$yaml_file" ]; then + return 1 + fi + + local in_paths=false + local paths_indent=0 + + while IFS= read -r line; do + # Remove comments but keep structure + line=$(echo "$line" | sed 's/#.*$//') + + # Skip empty lines + if [[ -z "${line// /}" ]]; then + continue + fi + + # Check for install_mode + if [ "$field" = "install_mode" ]; then + # Try with quotes first + if [[ "$line" =~ ^install_mode:[[:space:]]*[\"'](.+)[\"'] ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + # Try without quotes + elif [[ "$line" =~ ^install_mode:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + # Remove any trailing quotes or spaces + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//" | sed 's/[[:space:]]*$//') + echo "$val" + return 0 + fi + fi + + # Check for paths section + if [[ "$line" =~ ^paths: ]]; then + in_paths=true + paths_indent=$(echo "$line" | sed 's/[^ ].*//' | wc -c) + ((paths_indent--)) + continue + fi + + # If we hit another top-level key, stop looking in paths + if [ "$in_paths" = true ] && [[ "$line" =~ ^[a-zA-Z_][a-zA-Z0-9_]*: ]] && [[ ! 
"$line" =~ ^[[:space:]]+ ]]; then + in_paths=false + continue + fi + + # Extract path values + if [ "$in_paths" = true ]; then + case "$field" in + dags) + if [[ "$line" =~ ^[[:space:]]+dags:[[:space:]]*[\"'](.+)[\"'] ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+dags:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//") + echo "$val" + return 0 + fi + ;; + pipeline) + if [[ "$line" =~ ^[[:space:]]+pipeline:[[:space:]]*[\"'](.+)[\"'] ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+pipeline:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//") + echo "$val" + return 0 + fi + ;; + images) + if [[ "$line" =~ ^[[:space:]]+images:[[:space:]]*[\"'](.+)[\"'] ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+images:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//") + echo "$val" + return 0 + fi + ;; + esac + fi + done < "$yaml_file" + + return 1 +} + +# Function to find YAML config file in module +find_yaml_config() { + local module_path="$1" + + # Try common names in module root + local yaml_file="$module_path/module_envs.yaml" + if [ -f "$yaml_file" ]; then + echo "$yaml_file" + return 0 + fi + + yaml_file="$module_path/cosiflow.config.yaml" + if [ -f "$yaml_file" ]; then + echo "$yaml_file" + return 0 + fi + + # Try to find any *.config.yaml file in module root + yaml_file=$(find "$module_path" -maxdepth 1 -name "*.config.yaml" -type f 2>/dev/null | head -n 1) + if [ -n "$yaml_file" ] && [ -f "$yaml_file" ]; then + echo "$yaml_file" + return 0 + fi + + # Try to find any *.config.yaml file in env/ subdirectory + if [ -d "$module_path/env" ]; then + yaml_file=$(find "$module_path/env" -maxdepth 1 -name "*.config.yaml" -type f 2>/dev/null | head -n 1) + if [ -n "$yaml_file" ] && [ -f "$yaml_file" ]; then 
+ echo "$yaml_file" + return 0 + fi + fi + + return 1 +} + +# Function to load configuration from YAML file +# Usage: load_yaml_config [config_file_path] +# If config_file_path is given, it is used; otherwise config is auto-detected in module. +load_yaml_config() { + local module_path="$1" + local yaml_file + + if [ -n "$2" ] && [ -f "$2" ]; then + yaml_file="$2" + else + # Find YAML config file + local found_yaml=$(find_yaml_config "$module_path") + if [ $? -ne 0 ] || [ -z "$found_yaml" ]; then + return 1 + fi + yaml_file="$found_yaml" + fi + + # Load install_mode + local install_mode=$(parse_yaml_config "$yaml_file" "install_mode") + if [ -n "$install_mode" ]; then + case "$install_mode" in + container) + BUILD_DOCKER=true + CREATE_ENV=false + ;; + environment) + BUILD_DOCKER=false + CREATE_ENV=true + ;; + both) + BUILD_DOCKER=true + CREATE_ENV=true + ;; + none) + BUILD_DOCKER=false + CREATE_ENV=false + ;; + esac + fi + + # Load paths + local dags_path=$(parse_yaml_config "$yaml_file" "dags") + if [ -n "$dags_path" ]; then + PATH_DAGS="$dags_path" + fi + + local pipeline_path=$(parse_yaml_config "$yaml_file" "pipeline") + if [ -n "$pipeline_path" ]; then + PATH_PIPELINE="$pipeline_path" + fi + + local images_path=$(parse_yaml_config "$yaml_file" "images") + if [ -n "$images_path" ]; then + PATH_IMAGES="$images_path" + fi + + return 0 +} + +# Parse positional args: module_name [action] +MODULE_NAME=$1 +if [ "$2" = "install" ] || [ "$2" = "remove" ] || [ "$2" = "update" ]; then + ACTION=$2 + shift 2 +else + ACTION=install + shift 1 +fi + +# Load configuration from YAML if module exists and YAML file is present +MODULE_PATH="$WORKSPACE_ROOT/$MODULE_NAME" +if [ -d "$MODULE_PATH" ]; then + # Load YAML config as defaults (will be overridden by CLI options) + load_yaml_config "$MODULE_PATH" +fi + +# Parse options -d, -p, -f, -e, -r, -a, -E, -c +# CLI options override YAML configuration +while getopts "d:p:f:er:aE:c:" opt; do + case $opt in + d) PATH_DAGS="$OPTARG" ;; + 
p) PATH_PIPELINE="$OPTARG" ;; + f) PATH_IMAGES="$OPTARG" ;; + e) CREATE_ENV=true ;; + r) PATH_REQUIREMENTS="$OPTARG" ;; + a) CREATE_ENV=true; BUILD_DOCKER=true ;; + E) CREATE_ENV=true; ENV_SELECTION="$OPTARG" ;; + c) CONFIG_FILE="$OPTARG" ;; + :) echo "Option -$OPTARG requires an argument." >&2; exit 1 ;; + *) echo "Usage: $0 [install|remove|update] -d [dags] -p [pipeline] -f [images] -e [-r requirements.txt] -E [env1,env2|all] -a [-c config.yaml]" >&2; exit 1 ;; + esac +done + +if [ -z "$MODULE_NAME" ]; then + echo "Usage: $0 [install|remove|update] -d [dags] -p [pipeline] -f [images] -e [-r requirements.txt] -E [env1,env2|all] -a [-c config.yaml]" + echo "" + echo "Options (paths relative to module root unless absolute):" + echo " -d path to DAGs directory (default: src/dags)" + echo " -p path to pipeline directory (default: src/pipeline)" + echo " -f path to Docker context directory (default: env)" + echo " -e create Python virtual environment(s) in container (default: false)" + echo " -r path to requirements.txt file (default: env/requirements.txt, legacy single-env mode)" + echo " -E comma-separated list of environments from module_envs.yaml (e.g., env1,env2) or 'all'" + echo " If -E is used, module_envs.yaml will be read from module root" + echo " -a create Python environment AND build Docker image (equivalent to -e with Docker build)" + echo " -c path to config file (default: auto-detect in module)" + echo "" + echo "Examples:" + echo " # Legacy mode: single environment" + echo " $0 mymodule install -e -r env/requirements.txt" + echo "" + echo " # Multi-environment mode: install enabled environments from YAML" + echo " $0 mymodule install -e" + echo "" + echo " # Multi-environment mode: install specific environments" + echo " $0 mymodule install -E env1,env2" + echo "" + echo " # Multi-environment mode: install all environments" + echo " $0 mymodule install -E all" + echo "" + echo " # Use a specific config file (e.g. 
tutorial config under docs/)" + echo " $0 mymodule install -c docs/tutorials/bgo-loc/cosiflow/env/bgoloc.config.yaml" + exit 1 +fi + +# If -c was used: resolve to absolute path and reload config from that file +if [ -n "$CONFIG_FILE" ]; then + if [ ! -f "$CONFIG_FILE" ]; then + echo "❌ Config file not found: $CONFIG_FILE" >&2 + exit 1 + fi + CONFIG_FILE="$(cd "$(dirname "$CONFIG_FILE")" && pwd)/$(basename "$CONFIG_FILE")" + load_yaml_config "$MODULE_PATH" "$CONFIG_FILE" +fi + +# Helper to run docker exec as airflow user +dexec() { + docker exec -u $CONTAINER_USER $CONTAINER_NAME "$@" +} + +# Function to parse YAML and extract environment configurations +# Uses simple pattern matching (works without yq dependency) +parse_yaml_envs() { + local yaml_file="$1" + local env_name="$2" + local field="$3" # requirements, venv_path, enabled, description, python_version + + if [ ! -f "$yaml_file" ]; then + return 1 + fi + + # Simple YAML parsing - look for the environment block + local in_environments=false + local in_target_env=false + local env_indent_level=0 + + while IFS= read -r line; do + local original_line="$line" + # Remove comments but keep the line structure + line=$(echo "$line" | sed 's/#.*$//') + + # Skip completely empty lines + if [[ -z "${line// /}" ]]; then + continue + fi + + # Detect environments section + if [[ "$line" =~ ^environments: ]]; then + in_environments=true + continue + fi + + # If we hit another top-level key, stop looking + if [ "$in_environments" = true ] && [[ "$line" =~ ^[a-zA-Z_][a-zA-Z0-9_]*: ]] && [[ ! 
"$line" =~ ^[[:space:]]+ ]]; then + break + fi + + # Check if we're entering the target environment block + if [ "$in_environments" = true ] && [[ "$line" =~ ^[[:space:]]+${env_name}: ]]; then + in_target_env=true + # Count leading spaces to determine indent level + env_indent_level=$(echo "$line" | sed 's/[^ ].*//' | wc -c) + ((env_indent_level--)) + continue + fi + + # Check if we're leaving the target environment block (another env at same or less indent) + if [ "$in_target_env" = true ]; then + local current_indent=$(echo "$line" | sed 's/[^ ].*//' | wc -c) + ((current_indent--)) + + # If we hit another environment at same or less indent, we've left our target + if [[ "$line" =~ ^[[:space:]]*[a-zA-Z0-9_]+: ]] && [ $current_indent -le $env_indent_level ] && [[ ! "$line" =~ ^[[:space:]]+${env_name}: ]]; then + in_target_env=false + continue + fi + + # Extract field value if we're in the target environment + if [ "$in_target_env" = true ]; then + case "$field" in + requirements) + if [[ "$line" =~ ^[[:space:]]+requirements:[[:space:]]*[\"'](.+)[\"'] ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+requirements:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//") + echo "$val" + return 0 + fi + ;; + venv_path) + if [[ "$line" =~ ^[[:space:]]+venv_path:[[:space:]]*[\"'](.+)[\"'] ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+venv_path:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//") + echo "$val" + return 0 + fi + ;; + enabled) + if [[ "$line" =~ ^[[:space:]]+enabled:[[:space:]]*(true|false) ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+enabled:[[:space:]]+[\"']?(true|false)[\"']? 
]]; then + echo "${BASH_REMATCH[1]}" + return 0 + fi + ;; + description) + if [[ "$line" =~ ^[[:space:]]+description:[[:space:]]*[\"'](.+)[\"'] ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+description:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//") + echo "$val" + return 0 + fi + ;; + python_version) + if [[ "$line" =~ ^[[:space:]]+python_version:[[:space:]]*[\"']?([0-9.]+)[\"']? ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+python_version:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//") + echo "$val" + return 0 + fi + ;; + requirements_no_deps) + if [[ "$line" =~ ^[[:space:]]+requirements_no_deps:[[:space:]]*[\"'](.+)[\"'] ]]; then + echo "${BASH_REMATCH[1]}" + return 0 + elif [[ "$line" =~ ^[[:space:]]+requirements_no_deps:[[:space:]]+(.+) ]]; then + local val="${BASH_REMATCH[1]}" + val=$(echo "$val" | sed "s/^[\"']//;s/[\"']$//") + echo "$val" + return 0 + fi + ;; + esac + fi + fi + done < "$yaml_file" + + return 1 +} + +# Function to list all environments from YAML +list_yaml_envs() { + local yaml_file="$1" + + if [ ! -f "$yaml_file" ]; then + return 1 + fi + + # Extract environment names (lines with "env_name:" that are indented under "environments:") + # Only match keys with exactly 2 spaces of indent (environment names, not their fields) + local in_environments=false + while IFS= read -r line; do + # Remove comments but preserve structure + local original_line="$line" + line=$(echo "$line" | sed 's/#.*$//') + + # Skip empty lines + if [[ -z "${line// /}" ]]; then + continue + fi + + # Check if we're in the environments section + if [[ "$line" =~ ^environments: ]]; then + in_environments=true + continue + fi + + # If we hit another top-level key (no leading spaces), stop + if [ "$in_environments" = true ] && [[ "$line" =~ ^[a-zA-Z_][a-zA-Z0-9_]*: ]] && [[ ! 
"$line" =~ ^[[:space:]]+ ]]; then + break + fi + + # Extract environment names (keys with exactly 2 spaces indent, not 4+ which are fields) + if [ "$in_environments" = true ]; then + # Count leading spaces + local leading_spaces=$(echo "$line" | sed 's/[^ ].*//' | wc -c) + ((leading_spaces--)) + + # Only match lines with exactly 2 spaces (environment names) + # Match pattern: exactly 2 spaces, then a key name ending with colon + if [ $leading_spaces -eq 2 ] && [[ "$line" =~ ^[[:space:]][[:space:]]([a-zA-Z0-9_]+):[[:space:]]*$ ]]; then + echo "${BASH_REMATCH[1]}" + elif [ $leading_spaces -eq 2 ] && [[ "$line" =~ ^[[:space:]][[:space:]]([a-zA-Z0-9_]+): ]]; then + echo "${BASH_REMATCH[1]}" + fi + fi + done < "$yaml_file" +} + +# Function to create a single Python virtual environment +# Optional 4th argument: python_version (e.g. 3.11) to use python3.11 -m venv; if empty, uses python3 +# Optional 5th argument: requirements_no_deps file path; if set, installed after main requirements with pip --no-deps +create_single_env() { + local env_name="$1" + local requirements_file="$2" + local venv_path="$3" + local python_version="${4:-}" + local requirements_no_deps_file="${5:-}" + + # Choose Python interpreter: python3.11, python3.10, etc., or default python3 + local python_bin="python3" + if [ -n "$python_version" ]; then + # Normalize: "3.11" -> python3.11; already "python3.11" -> use as-is + if [[ "$python_version" =~ ^python ]]; then + python_bin="$python_version" + else + python_bin="python${python_version}" + fi + echo " 🐍 Using Python: $python_bin" + fi + + echo " 📦 Creating environment '$env_name' at $venv_path..." + + # Create venv + dexec $python_bin -m venv "$venv_path" 2>/dev/null || { + # If venv already exists, remove it first + echo " ⚠️ Virtual environment already exists, removing old one..." + dexec rm -rf "$venv_path" + dexec $python_bin -m venv "$venv_path" + } + + if [ $? -ne 0 ]; then + echo " ❌ Failed to create virtual environment for '$env_name'." 
+ return 1 + fi + + # Copy requirements file to container + local temp_req="/tmp/requirements_${env_name}.txt" + echo " 📋 Copying requirements file to container..." + docker cp "$requirements_file" "$CONTAINER_NAME:$temp_req" + + if [ $? -ne 0 ]; then + echo " ❌ Failed to copy requirements file." + return 1 + fi + + # Install packages + echo " 🔧 Installing packages..." + dexec bash -c ". $venv_path/bin/activate && \ + pip install --no-cache-dir --upgrade pip setuptools wheel && \ + pip install --no-cache-dir -r $temp_req" + + local install_status=$? + + # Cleanup main requirements + dexec rm -f "$temp_req" + + if [ $install_status -ne 0 ]; then + echo " ❌ Failed to install packages for '$env_name'." + return 1 + fi + + # Optional: install extra requirements with --no-deps (e.g. to avoid dependency conflicts) + if [ -n "$requirements_no_deps_file" ] && [ -f "$requirements_no_deps_file" ]; then + local temp_nodeps="/tmp/requirements_${env_name}_nodeps.txt" + echo " 📋 Installing extra packages (--no-deps)..." + docker cp "$requirements_no_deps_file" "$CONTAINER_NAME:$temp_nodeps" + if [ $? -eq 0 ]; then + dexec bash -c ". $venv_path/bin/activate && pip install --no-cache-dir --no-deps -r $temp_nodeps" + dexec rm -f "$temp_nodeps" + fi + fi + + # Create activation helper script for this environment + local activate_script="/home/gamma/activate_${env_name}.sh" + dexec bash -c "cat > $activate_script << 'EOF' +#!/bin/bash +# Helper script to activate the $env_name virtual environment +source $venv_path/bin/activate +echo \"✅ Activated Python environment: \$VIRTUAL_ENV\" +echo \"Python path: \$(which python)\" +EOF + chmod +x $activate_script" + + echo " ✅ Environment '$env_name' created successfully." 
+ echo " 💡 Activate with: source $activate_script" + echo " 💡 Or use: $venv_path/bin/python" + + return 0 +} + +echo "🔹 Action: $ACTION module '$MODULE_NAME'" + +# ============================================================================== +# REMOVE +# ============================================================================== +if [ "$ACTION" == "remove" ]; then + echo "Removing module '$MODULE_NAME'..." + + MODULE_PATH="$WORKSPACE_ROOT/$MODULE_NAME" + if [ -n "$CONFIG_FILE" ]; then + YAML_CONFIG="$CONFIG_FILE" + else + YAML_CONFIG=$(find_yaml_config "$MODULE_PATH") + fi + + # Load config to know what to remove + remove_envs=false + remove_container=false + + if [ -n "$YAML_CONFIG" ] && [ -f "$YAML_CONFIG" ]; then + echo "📄 Using configuration from: $(basename "$YAML_CONFIG")" + + # Check what was configured to install + install_mode=$(parse_yaml_config "$YAML_CONFIG" "install_mode") + case "$install_mode" in + container|both) + remove_container=true + ;; + esac + + # Check if environments were configured + all_envs=($(list_yaml_envs "$YAML_CONFIG")) + if [ ${#all_envs[@]} -gt 0 ]; then + remove_envs=true + fi + + # Load paths from config + dags_path=$(parse_yaml_config "$YAML_CONFIG" "dags") + if [ -n "$dags_path" ]; then + PATH_DAGS="$dags_path" + fi + + pipeline_path=$(parse_yaml_config "$YAML_CONFIG" "pipeline") + if [ -n "$pipeline_path" ]; then + PATH_PIPELINE="$pipeline_path" + fi + else + # Legacy mode: remove everything + remove_envs=true + remove_container=true + fi + + # Remove DAGs link + echo " 🔗 Removing DAGs link..." + dexec rm -f /home/gamma/airflow/dags/$MODULE_NAME$EXTENSION_MODULE 2>/dev/null + echo " ✅ DAGs link removed." + + # Remove Pipeline scripts link (only if pipeline path was configured) + if [ -n "$PATH_PIPELINE" ]; then + echo " 🔗 Removing Pipeline scripts link..." + dexec rm -f /home/gamma/airflow/pipeline/$MODULE_NAME$EXTENSION_MODULE 2>/dev/null + echo " ✅ Pipeline scripts link removed." 
+ fi + + # Remove Python virtual environments (if configured) + if [ "$remove_envs" = true ]; then + if [ -n "$YAML_CONFIG" ] && [ -f "$YAML_CONFIG" ]; then + echo " 🐍 Removing Python virtual environments..." + all_envs=($(list_yaml_envs "$YAML_CONFIG")) + if [ ${#all_envs[@]} -gt 0 ]; then + for env_name in "${all_envs[@]}"; do + venv_path_yaml=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "venv_path") + venv_path="${venv_path_yaml:-/home/gamma/envs/$env_name}" + activate_script="/home/gamma/activate_${env_name}.sh" + + dexec rm -rf "$venv_path" 2>/dev/null + dexec rm -f "$activate_script" 2>/dev/null + echo " ✅ Removed environment '$env_name'" + done + else + echo " ⚠️ No environments found in config." + fi + else + # Legacy: remove default cosipy environment + dexec rm -rf "$VENV_PATH" 2>/dev/null + dexec rm -f /home/gamma/activate_cosipy.sh 2>/dev/null + echo " ✅ Removed default Python environment." + fi + else + echo " ⏭️ Skipping Python environments (not configured)." + fi + + # Remove Docker image (if configured) + if [ "$remove_container" = true ]; then + echo " 🐳 Removing Docker image..." + docker rmi -f ${MODULE_NAME}:latest 2>/dev/null + if [ $? -eq 0 ]; then + echo " ✅ Removed image ${MODULE_NAME}:latest." + else + echo " ⚠️ Image ${MODULE_NAME}:latest not found or already removed." + fi + else + echo " ⏭️ Skipping Docker image (not configured)." + fi + + echo "" + echo "✅ Module $MODULE_NAME removed from Airflow." 
+ exit 0 +fi + +# ============================================================================== +# INSTALL / UPDATE +# ============================================================================== +if [ "$ACTION" == "install" ] || [ "$ACTION" == "update" ]; then + + # Find YAML config file (or use -c path) + if [ -n "$CONFIG_FILE" ]; then + YAML_CONFIG="$CONFIG_FILE" + else + YAML_CONFIG=$(find_yaml_config "$MODULE_PATH") + fi + + if [ -n "$YAML_CONFIG" ] && [ -f "$YAML_CONFIG" ]; then + echo "📄 Configuration loaded from: $(basename "$YAML_CONFIG")" + install_mode=$(parse_yaml_config "$YAML_CONFIG" "install_mode") + + # Debug: check what was read + if [ -z "$install_mode" ]; then + echo " ⚠️ Warning: install_mode not found or empty in YAML" + echo " Trying direct grep..." + install_mode=$(grep -E "^install_mode:" "$YAML_CONFIG" | sed 's/^install_mode:[[:space:]]*//' | sed 's/[[:space:]]*$//' | head -n 1) + fi + + if [ -n "$install_mode" ]; then + echo " Install mode: $install_mode" + else + echo " ⚠️ Install mode: (empty or not found)" + fi + echo "" + + # Apply install_mode settings directly (in case load_yaml_config wasn't called earlier or didn't find the file) + if [ -n "$install_mode" ]; then + case "$install_mode" in + container) + BUILD_DOCKER=true + CREATE_ENV=false + ;; + environment) + BUILD_DOCKER=false + CREATE_ENV=true + ;; + both) + BUILD_DOCKER=true + CREATE_ENV=true + ;; + none) + BUILD_DOCKER=false + CREATE_ENV=false + ;; + esac + # Debug output + echo " 🔧 Applied settings: BUILD_DOCKER=$BUILD_DOCKER, CREATE_ENV=$CREATE_ENV" + echo "" + else + echo " ⚠️ Cannot apply install_mode: value is empty" + echo "" + fi + + # Reload config to ensure we have latest paths (in case YAML was found after initial load) + load_yaml_config "$MODULE_PATH" + + # Ensure install_mode is still applied after load_yaml_config (it might override) + if [ -n "$install_mode" ]; then + case "$install_mode" in + container) + BUILD_DOCKER=true + CREATE_ENV=false + ;; + 
environment) + BUILD_DOCKER=false + CREATE_ENV=true + ;; + both) + BUILD_DOCKER=true + CREATE_ENV=true + ;; + none) + BUILD_DOCKER=false + CREATE_ENV=false + ;; + esac + fi + fi + + # 1. Link DAGs (Airflow needs the DAG definition) + echo "1️⃣ Linking DAGs..." + dexec ln -sfn /home/gamma/airflow/modules_pool/$MODULE_NAME/$PATH_DAGS /home/gamma/airflow/dags/$MODULE_NAME$EXTENSION_MODULE + + if [ $? -eq 0 ]; then + echo " ✅ DAGs linked." + else + echo " ❌ Failed to link DAGs." + exit 1 + fi + + # 2. Link Pipeline scripts (only if pipeline path is configured) + if [ -n "$PATH_PIPELINE" ]; then + echo "2️⃣ Linking Pipeline scripts..." + dexec ln -sfn /home/gamma/airflow/modules_pool/$MODULE_NAME/$PATH_PIPELINE /home/gamma/airflow/pipeline/$MODULE_NAME$EXTENSION_MODULE + + if [ $? -eq 0 ]; then + echo " ✅ Pipeline scripts linked." + else + echo " ❌ Failed to link Pipeline scripts." + exit 1 + fi + else + echo "2️⃣ ⏭️ Skipping Pipeline scripts (not configured)." + fi + + # 3. Create Python Virtual Environment(s) (if requested) + if [ "$CREATE_ENV" == true ]; then + if [ "$ACTION" == "update" ]; then + echo "3️⃣ Updating Python Virtual Environment(s)..." + else + echo "3️⃣ Creating Python Virtual Environment(s)..." + fi + + # Check if YAML config exists and -E flag was used (multi-env mode) + if [ -n "$YAML_CONFIG" ] && [ -f "$YAML_CONFIG" ] && [ -n "$ENV_SELECTION" ]; then + echo " 📄 Using multi-environment mode with $(basename "$YAML_CONFIG")" + + # Determine which environments to create + local envs_to_create=() + + if [ "$ENV_SELECTION" = "all" ]; then + # Get all environments from YAML + envs_to_create=($(list_yaml_envs "$YAML_CONFIG")) + else + # Parse comma-separated list + IFS=',' read -ra envs_to_create <<< "$ENV_SELECTION" + fi + + if [ ${#envs_to_create[@]} -eq 0 ]; then + echo " ⚠️ No environments found or specified." + echo " Skipping environment creation." 
+ else + local success_count=0 + local fail_count=0 + + for env_name in "${envs_to_create[@]}"; do + env_name=$(echo "$env_name" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + + # Extract environment configuration from YAML + local req_path=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "requirements") + local venv_path_yaml=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "venv_path") + local enabled=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "enabled") + local description=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "description") + local python_version=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "python_version") + local req_no_deps=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "requirements_no_deps") + + if [ -z "$req_path" ]; then + echo " ⚠️ Environment '$env_name' not found in YAML or missing requirements." + ((fail_count++)) + continue + fi + + # Resolve requirements file path + if [ "${req_path:0:1}" = "/" ]; then + REQUIREMENTS_FILE="$req_path" + else + REQUIREMENTS_FILE="$MODULE_PATH/$req_path" + fi + + # Resolve requirements_no_deps path (optional) + local REQUIREMENTS_NODEPS_FILE="" + if [ -n "$req_no_deps" ]; then + if [ "${req_no_deps:0:1}" = "/" ]; then + REQUIREMENTS_NODEPS_FILE="$req_no_deps" + else + REQUIREMENTS_NODEPS_FILE="$MODULE_PATH/$req_no_deps" + fi + fi + + # Use venv_path from YAML or default + local final_venv_path="${venv_path_yaml:-/home/gamma/envs/$env_name}" + + # Show description if available + if [ -n "$description" ]; then + echo " 📝 $description" + fi + + # Check if requirements file exists + if [ ! -f "$REQUIREMENTS_FILE" ]; then + echo " ⚠️ Requirements file not found: $REQUIREMENTS_FILE" + echo " Skipping environment '$env_name'." 
+ ((fail_count++)) + continue + fi + + # Create the environment + if create_single_env "$env_name" "$REQUIREMENTS_FILE" "$final_venv_path" "$python_version" "$REQUIREMENTS_NODEPS_FILE"; then + ((success_count++)) + else + ((fail_count++)) + fi + echo "" + done + + echo " 📊 Summary: $success_count environment(s) created successfully" + if [ $fail_count -gt 0 ]; then + echo " ⚠️ $fail_count environment(s) failed" + fi + fi + + elif [ -n "$YAML_CONFIG" ] && [ -f "$YAML_CONFIG" ] && [ -z "$ENV_SELECTION" ]; then + # YAML exists but no -E flag: use enabled environments + echo " 📄 Using $(basename "$YAML_CONFIG") (installing enabled environments)" + + all_envs=($(list_yaml_envs "$YAML_CONFIG")) + + # Debug: show what was found + if [ ${#all_envs[@]} -eq 0 ]; then + echo " ⚠️ No environments found in YAML file: $YAML_CONFIG" + else + echo " 📋 Found ${#all_envs[@]} environment(s): ${all_envs[*]}" + fi + + envs_to_create=() + + for env_name in "${all_envs[@]}"; do + enabled=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "enabled") + if [ "$enabled" = "true" ]; then + envs_to_create+=("$env_name") + fi + done + + if [ ${#envs_to_create[@]} -eq 0 ]; then + if [ ${#all_envs[@]} -eq 0 ]; then + echo " ⚠️ No environments found in YAML." + else + echo " ⚠️ No enabled environments found in YAML (found ${#all_envs[@]} environment(s) but none are enabled)." + fi + echo " Use -E flag to specify environments or enable them in YAML." 
+ else + success_count=0 + fail_count=0 + + for env_name in "${envs_to_create[@]}"; do + req_path=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "requirements") + venv_path_yaml=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "venv_path") + description=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "description") + python_version=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "python_version") + req_no_deps=$(parse_yaml_envs "$YAML_CONFIG" "$env_name" "requirements_no_deps") + + if [ "${req_path:0:1}" = "/" ]; then + REQUIREMENTS_FILE="$req_path" + else + REQUIREMENTS_FILE="$MODULE_PATH/$req_path" + fi + + REQUIREMENTS_NODEPS_FILE="" + if [ -n "$req_no_deps" ]; then + if [ "${req_no_deps:0:1}" = "/" ]; then + REQUIREMENTS_NODEPS_FILE="$req_no_deps" + else + REQUIREMENTS_NODEPS_FILE="$MODULE_PATH/$req_no_deps" + fi + fi + + final_venv_path="${venv_path_yaml:-/home/gamma/envs/$env_name}" + + if [ -n "$description" ]; then + echo " 📝 $description" + fi + + if [ ! -f "$REQUIREMENTS_FILE" ]; then + echo " ⚠️ Requirements file not found: $REQUIREMENTS_FILE" + ((fail_count++)) + continue + fi + + if create_single_env "$env_name" "$REQUIREMENTS_FILE" "$final_venv_path" "$python_version" "$REQUIREMENTS_NODEPS_FILE"; then + ((success_count++)) + else + ((fail_count++)) + fi + echo "" + done + + echo " 📊 Summary: $success_count environment(s) created successfully" + if [ $fail_count -gt 0 ]; then + echo " ⚠️ $fail_count environment(s) failed" + fi + fi + + else + # Legacy mode: single environment with -r flag or default + echo " 📦 Using legacy single-environment mode" + + if [ "${PATH_REQUIREMENTS:0:1}" = "/" ]; then + REQUIREMENTS_FILE="$PATH_REQUIREMENTS" + else + REQUIREMENTS_FILE="$MODULE_PATH/$PATH_REQUIREMENTS" + fi + + if [ ! -f "$REQUIREMENTS_FILE" ]; then + echo " ⚠️ Requirements file not found: $REQUIREMENTS_FILE" + echo " Skipping environment creation." 
+ else + if create_single_env "cosipy" "$REQUIREMENTS_FILE" "$VENV_PATH"; then + echo "" + else + exit 1 + fi + fi + fi + fi + + # 4. Build/Prepare Docker Image (if requested) + if [ "$BUILD_DOCKER" == true ]; then + echo "4️⃣ Building Docker Image..." + + # Resolve Docker context path + if [ "${PATH_IMAGES:0:1}" = "/" ]; then + DOCKER_CONTEXT="$PATH_IMAGES" + else + DOCKER_CONTEXT="$MODULE_PATH/$PATH_IMAGES" + fi + + if [ -d "$MODULE_PATH" ] && [ -f "$DOCKER_CONTEXT/Dockerfile" ]; then + echo " Found Dockerfile in $DOCKER_CONTEXT" + echo " Building image '${MODULE_NAME}:latest'..." + + # Build the image on the HOST + docker build -t "${MODULE_NAME}:latest" "$DOCKER_CONTEXT" + + if [ $? -eq 0 ]; then + echo " ✅ Image '${MODULE_NAME}:latest' built successfully." + else + echo " ❌ Docker build failed." + exit 1 + fi + else + echo " ❌ Docker build requested but Dockerfile not found." + echo " Checked path: $DOCKER_CONTEXT/Dockerfile" + exit 1 + fi + else + echo "4️⃣ ⏭️ Skipping Docker image build (not requested)." + fi + + echo "🎉 Module $MODULE_NAME ready!" + exit 0 +fi + +echo "❌ Unknown action: $ACTION. Use [install|remove|update]" +exit 1 diff --git a/modules/README.md b/modules/README.md new file mode 100644 index 0000000..b8069cb --- /dev/null +++ b/modules/README.md @@ -0,0 +1,269 @@ +# CosiDAG + +A high-level reactive DAG template for filesystem-driven scientific workflows + +## Overview + +`COSIDAG` is a convenience subclass of Airflow’s `DAG` designed to simplify the creation of **file-driven scientific pipelines**. +It encapsulates a standard five-step workflow pattern: + +1. **check_new_file** + Monitors one or more folders, searching for new subdirectories. + Handles date filtering, basename filtering, depth traversal, and stability checks. + Pushes the detected folder path into XCom and tracks processed folders via Airflow Variables. + +2. **automatic_retrig** + Immediately triggers a new run of the same DAG, so the sensor keeps watching for fresh data. 
+ This enables near-real-time reactive pipelines. + +3. **resolve_inputs** *(optional)* + If `file_patterns` is provided, the module automatically scans the detected folder, resolves filenames, and pushes the results to XCom using user-defined keys. + +4. **custom tasks** + A user-provided `build_custom(dag)` function can attach any processing tasks (analysis, binning, model execution, visualization). + These tasks use the values produced by steps 1–3 via `xcom_pull`. + +5. **show_results** + Logs a homepage URL (read from an environment variable) and optionally builds a deep link referencing the detected folder. + +This structure removes 80–90% of the boilerplate typically involved in writing dynamic pipelines while ensuring consistency across future COSI workflows (e.g. **TSMap**, **Light Curve**). + +--- + +## Why use CosiDAG? + +CosiDAG solves common problems in scientific workflows: + +* **Dynamic file discovery** — your pipeline reacts automatically to new folders dropped on disk. +* **No hard-coded filenames** — files are resolved automatically using regex patterns. +* **Unified behavior** — TSMap, Light Curve, and other pipelines share the same structure. +* **Clean separation of infrastructure vs science code** — COSIDAG handles monitoring, deduplication, and XCom logic; the user only implements the scientific tasks. +* **Consistent task orchestration** — every pipeline follows the same five-step DAG layout. + +Additionally, CosiDAG integrates seamlessly with the **MailHog link plugin**, which captures and exposes exception emails directly in the Airflow UI. +This means that mailbox-based alerting and debugging works **out-of-the-box** with all CosiDAG-derived workflows, without requiring extra configuration. 
+ +--- + +## Minimal Example + +```python +from datetime import datetime +from cosidag import COSIDAG +from airflow.operators.python import ExternalPythonOperator + +def build_custom(dag): + + # Pull runtime-discovered folder and file paths + RUN_DIR = "{{ ti.xcom_pull('check_new_file', key='detected_folder') }}" + RESPONSE = "{{ ti.xcom_pull('resolve_inputs', key='response_file') }}" + + def compute(run_dir: str, response_file: str): + print("Running analysis...", run_dir, response_file) + + compute_task = ExternalPythonOperator( + task_id="compute_step", + python="/path/to/external/env/bin/python", + python_callable=compute, + op_kwargs={"run_dir": RUN_DIR, "response_file": RESPONSE}, + dag=dag, + ) + + return [compute_task] + +with COSIDAG( + dag_id="example_cosidag", + schedule_interval=None, + start_date=datetime(2025, 1, 1), + monitoring_folders=["/data/incoming"], + file_patterns={ + "response_file": r"Response.*\.h5" + }, + select_policy="latest_mtime", + only_basename="products", + prefer_deepest=True, + idle_seconds=5, + build_custom=build_custom, + tags=["example"], +): + pass +``` + +--- + +## How CosiDAG Passes Data Between Tasks + +CosiDAG uses **XCom** to pass runtime-discovered paths to the user-defined tasks. + +* `check_new_file` pushes the detected folder: + + ``` + key="detected_folder" + ``` +* `resolve_inputs` pushes files matched by regex patterns, using the corresponding keys: + + ``` + "response_file": "/path/to/Response_003.h5" + ``` + +User tasks retrieve them via: + +```python +"{{ ti.xcom_pull('check_new_file', key='detected_folder') }}" +"{{ ti.xcom_pull('resolve_inputs', key='response_file') }}" +``` + +This allows pipelines to be fully dynamic and independent of hard-coded paths. 
+ +--- + +## Configuration Parameters + +| Parameter | Type | Description | +| ----------------------------- | --------------------------- | ------------------------------------------------------------ | +| `monitoring_folders` | list[str] | Folders to scan for new data. | +| `level` | int | Directory depth to scan. | +| `date` | str/int | Accept only folders with this date. | +| `date_queries` | str | Query expression for date filtering (e.g. `==20251119`). | +| `only_basename` | str | Accept only folders with this basename (e.g., `"products"`). | +| `prefer_deepest` | bool | Selects deepest matching subfolder. | +| `min_files` | int | Minimum number of files required before accepting a folder. | +| `idle_seconds` | int | Seconds to wait for the folder to “settle” (no live writes). | +| `ready_marker` | str | Marker file required for folder acceptance. | +| `home_env_var` | str | ENV var containing the UI base URL. | +| `file_patterns` | dict[str, str] | Mapping XCom key → regex pattern for auto file resolution. | +| `select_policy` | `"latest_mtime"`, `"first"` | Strategy for resolving multiple matches. | +| `default_args_extra` | dict | Additional default args for tasks. | +| `tags` | list[str] | Airflow UI tags. | +| `auto_retrig` | bool | Enables real-time monitoring. | +| `processed_variable` | str | Name of the Airflow Variable storing processed paths. | +| `builder_fn` / `build_custom` | callable | Function that attaches user-defined tasks. | +| `xcom_detected_key` | str | XCom key for detected folder path. | + +--- + +## Configuring and Running a CosiDAG from the Airflow UI + +Once a CosiDAG script is defined, **you do not need to modify the Python file** to run the pipeline on different datasets. 
+Instead, Airflow’s Trigger UI allows you to dynamically set: + +* monitoring folders +* date filters +* file patterns +* selection policy +* any custom configuration values defined in your DAG parameters + +This makes CosiDAG pipelines fully reusable: **the same code can be triggered dozens of times with different inputs**, without editing the script. + +To run a CosiDAG on new data: + +1. Open the DAG in the Airflow Web UI +2. Click **Trigger DAG** +3. Fill in the configuration form (folder paths, patterns, etc.) +4. Click **Trigger** + +Every run will process a different dataset with identical logic. + +--- + +## Processed Folder Tracking (Airflow Variable) + +Every CosiDAG keeps track of previously processed folders using an Airflow Variable named after its `dag_id`: + +``` +COSIDAG_PROCESSED::{dag_id} +``` + +This prevents the pipeline from reprocessing the same folder unless explicitly requested. + +### Viewing the stored folders + +From the CLI: + +```bash +airflow variables get COSIDAG_PROCESSED::{dag_id} +``` + +### Clearing the list (e.g. to reprocess everything) + +```bash +airflow variables set COSIDAG_PROCESSED::{dag_id} "[]" +``` + +### Removing the variable entirely + +```bash +airflow variables delete COSIDAG_PROCESSED::{dag_id} +``` + +These commands allow you to “reset” the monitoring history at any time. + +--- + +## Disabling Automatic Retrigger + +By default, CosiDAG enables **automatic retriggering** (`auto_retrig=True`), meaning the DAG keeps running in a loop to continuously watch for new folders. + +You can disable this behavior by setting: + +```python +auto_retrig=False +``` + +or by exposing it as a configurable parameter and turning it off in the Airflow UI.
+ +When retriggering is disabled: + +* The DAG will **not** restart automatically +* You can manually rerun the pipeline on a folder that was already processed +* This is useful for **re-analysis**, debugging, or running multiple configurations on the same dataset + +Disabling retriggering + clearing the processed-variable list lets you fully reprocess any folder without modifying the DAG code. + +--- + +## MailHog Link Plugin (Exception Visibility) + +CosiDAG integrates cleanly with the Airflow **MailHog link plugin**, a small extension that: + +* intercepts exception emails generated by Airflow, +* displays a direct link to the captured email next to the failed task in the Airflow UI. + +While you don’t need to configure anything manually, it is useful to know that: + +* When a CosiDAG-based pipeline fails, any email alerts triggered by Airflow’s email backend will appear in the MailHog UI. +* COSIFLOW’s environment already includes MailHog and the plugin, so notifications are automatically routed and linked. +* This provides faster debugging and lowers the cost of diagnosing failed tasks. + +You do **not** need to interact with MailHog directly, CosiDAG DAGs just benefit from it. 
+ +--- + +## When to Use CosiDAG + +CosiDAG is ideal when your workflow: + +* **should react to new data** appearing in a filesystem, +* requires robust **folder validation**, +* should **run analysis scripts in external Python environments**, +* must be: + - **easy to maintain** + - **extend** + - **reuse across different scientific pipelines** + +Examples include: + +* TSMap pipeline +* Light Curve pipeline +* SimData ingestion +* Any workflow triggered by incoming instrument data + +--- + +## Summary + +* CosiDAG pipelines are reusable and configurable directly from the Airflow UI +* No need to modify the DAG script to process new datasets +* Processed folders are stored in `COSIDAG_PROCESSED::` +* CLI commands allow viewing, clearing, or deleting this history +* The automatic retrigger step can be disabled to purposely re-run old folders \ No newline at end of file diff --git a/modules/cosidag.py b/modules/cosidag.py new file mode 100644 index 0000000..f2a3366 --- /dev/null +++ b/modules/cosidag.py @@ -0,0 +1,861 @@ +""" +COSIDAG — a convenience DAG subclass that wires a standard layout: + + 1) check_new_file -> 2) automatic_retrig -> 3) resolve_inputs -> 4) [custom tasks] -> 5) show_results + +This implementation supports disabling optional steps: + +- check_new_file is NOT created if monitoring_folders is empty / None. +- automatic_retrig is NOT created if auto_retrig is False. + +The chaining logic adapts automatically depending on which tasks exist. + +Notes +------ +* Requires Airflow 2.x. +* Environment: define COSIFLOW_HOME_URL (in your .env) to point to the web UI homepage. +* State: a Variable named f"COSIDAG_PROCESSED::{dag_id}" is used to track processed + folder paths across runs, to avoid reprocessing the same folder. + * To clear the processed folder paths, delete the Variable, with the command: + airflow variables set COSIDAG_PROCESSED::{cosidag_id} [] +* Date queries: use date_queries (e.g. '>=2025-11-01' or ['>=2025-11-01','<=2025-11-05']). 
+* Only basename: if only_basename is provided, it only accepts subfolders with the given basename. +* Prefer deepest: if prefer_deepest is True, it prefers the deepest subfolder. +* File patterns: if file_patterns is provided, it searches for files matching the given patterns + using glob recursion and selects according to select_policy. +* Path helper: if available, the module cosiflow.modules.path is used to parse/build + URL fragments from detected folders. The code degrades gracefully if not found. +""" +from __future__ import annotations + +import os +import json +import re +import time +from datetime import datetime +from typing import Callable, Iterable, Optional, Sequence + +from airflow import DAG +from airflow.models import Variable +from airflow.operators.empty import EmptyOperator +from airflow.operators.python import PythonOperator +from airflow.sensors.python import PythonSensor +from airflow.utils.trigger_rule import TriggerRule +from airflow.operators.trigger_dagrun import TriggerDagRunOperator + +# ---- Optional path utils -------------------------------------------------------- +try: + from cosiflow.modules.path import PathInfo, build_url_fragment # type: ignore +except Exception: + PathInfo = None # type: ignore + build_url_fragment = None # type: ignore + +# ---- Import on-failure callback ------------------------------------------------- +import sys + +airflow_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow") +sys.path.append(os.path.join(airflow_home, "callbacks")) +sys.path.append(os.path.join(airflow_home, "modules")) +from on_failure_callback import notify_email # type: ignore + +from date_helper import _looks_like_date_folder, _parse_date_string, _apply_date_queries # type: ignore + +_BASE_DEFAULT_ARGS = { + "owner": "cosiflow", + "email_on_failure": True, + "on_failure_callback": notify_email, # from callbacks/on_failure_callback.py +} + +# --- Public config helpers (Airflow Variable -> ENV -> default) ----------------- +try: + from 
airflow.models import Variable as _AFVariable +except Exception: + _AFVariable = None + + +def cfg(key: str, default=None): + """Read config from Airflow Variable, then ENV, else default.""" + val = None + if _AFVariable is not None: + try: + val = _AFVariable.get(key) + except Exception: + val = None + if val is None: + val = os.environ.get(key, default) + return val + + +def cfg_int(key: str, default: int) -> int: + v = cfg(key, default) + try: + return int(v) + except Exception: + return default + + +def cfg_float(key: str, default: float) -> float: + v = cfg(key, default) + try: + return float(v) + except Exception: + return default + + +def cfg_bool(key: str, default: bool = False) -> bool: + v = cfg(key, None) + if isinstance(v, bool): + return v + if v is None: + return default + return str(v).strip().lower() in {"1", "true", "t", "yes", "y", "on"} + + +# ----- Helper functions (MUST stay at module top-level) -------------------------- + + +def _dir_stats(path: str): + """Return (count, total_size_bytes, latest_mtime) across all files under path.""" + count = 0 + total = 0 + latest = 0.0 + for root, _, files in os.walk(path): + for fn in files: + fp = os.path.join(root, fn) + try: + st = os.stat(fp) + except FileNotFoundError: + continue + count += 1 + total += st.st_size + if st.st_mtime > latest: + latest = st.st_mtime + return count, total, latest + + +def _is_dir_stable(path: str, idle_seconds: int, min_files: int) -> bool: + """True if dir has >= min_files and last write is older than idle_seconds.""" + count, _, latest = _dir_stats(path) + if count < min_files: + return False + return (time.time() - latest) >= idle_seconds + + +def _normalize_folders(monitoring_folders: Iterable[str]) -> Sequence[str]: + """Return absolute existing directories; ignore non-existing.""" + if isinstance(monitoring_folders, (str, os.PathLike)): + candidates = [str(monitoring_folders)] + else: + candidates = [str(p) for p in monitoring_folders] + out = [] + for p in 
candidates: + ap = os.path.abspath(os.path.expanduser(p)) + if os.path.isdir(ap): + out.append(ap) + return out + + +def _iter_subfolders(root: str, max_depth: int) -> Iterable[str]: + """Yield subfolders under root up to max_depth (depth 1 = direct children).""" + root_depth = root.rstrip(os.sep).count(os.sep) + for current_root, dirs, _ in os.walk(root): + current_depth = current_root.rstrip(os.sep).count(os.sep) - root_depth + if current_depth > max_depth: + dirs[:] = [] + continue + if current_depth >= 1: + yield current_root + + +def _date_filter_ok(path: str, date_queries) -> bool: + """ + Accept path if its 'reference date' (folder name or mtime) satisfies ALL queries. + + date_queries can be: + - None -> always True + - string like '>=2025-11-01' + - list of strings ['>=2025-11-01', '<=2025-11-05'] + """ + print(f"[COSIDAG] _date_filter_ok: path={path}, date_queries={date_queries}") + if not date_queries: + return True + + last = os.path.basename(os.path.normpath(path)) + + # 1) Try parsing date from folder name (YYYYMMDD[_...] or YYYY-MM-DD[_...]) + ref_date = None + if _looks_like_date_folder(last): + ds = last.split("_")[0] + try: + ref_date = _parse_date_string(ds) + except Exception as e: + print(f"[COSIDAG] _date_filter_ok: failed to parse folder date {ds!r}: {e}") + + # 2) Fallback to mtime date + if ref_date is None: + try: + ref_date = datetime.fromtimestamp(os.stat(path).st_mtime).date() + except Exception as e: + print(f"[COSIDAG] _date_filter_ok: failed to get mtime for {path}: {e}") + # If we cannot determine a reference date, do not filter out for safety. 
+ return True + + return _apply_date_queries(ref_date, date_queries) + + +def _load_processed_set(dag_id: str) -> set: + """Load processed paths set from Airflow Variable.""" + key = f"COSIDAG_PROCESSED::{dag_id}" + raw = Variable.get(key, default_var="[]") + try: + return set(json.loads(raw)) + except Exception: + return set() + + +def _save_processed_set(dag_id: str, processed: set) -> None: + """Save processed paths set to Airflow Variable.""" + key = f"COSIDAG_PROCESSED::{dag_id}" + Variable.set(key, json.dumps(sorted(processed))) + + +def _find_new_folder( + monitoring_folders: Iterable[str], + level: int, + dag_id: str, + date_queries: Optional[str | list[str]] = None, + only_basename: Optional[str] = None, + prefer_deepest: bool = True, +) -> Optional[str]: + """Return the first new folder across roots (filtered & depth-limited).""" + print( + "[COSIDAG] _find_new_folder: searching for new folders " + f"(dag_id={dag_id}, level={level}, date_queries={date_queries}, only_basename={only_basename})" + ) + roots = _normalize_folders(monitoring_folders) + if not roots: + print("[COSIDAG] _find_new_folder: no valid monitoring folders found") + return None + + print(f"[COSIDAG] _find_new_folder: monitoring {len(roots)} root folder(s): {', '.join(roots)}") + processed = _load_processed_set(dag_id) + print(f"[COSIDAG] _find_new_folder: loaded {len(processed)} already processed folder(s)") + + candidates: list[str] = [] + for root in sorted(roots): + subfolders = list(_iter_subfolders(root, max_depth=level)) # materialize once + print(f"[COSIDAG] _find_new_folder: found {len(subfolders)} subfolder(s) in {root} (max_depth={level})") + for sub in subfolders: + if only_basename and os.path.basename(sub) != only_basename: + continue + if _date_filter_ok(sub, date_queries): + candidates.append(sub) + + if not candidates: + print("[COSIDAG] _find_new_folder: no candidates found after filtering") + return None + + print(f"[COSIDAG] _find_new_folder: {len(candidates)} 
candidate folder(s) after filtering") + + # Prefer deeper paths first + if prefer_deepest: + candidates.sort(key=lambda p: (p.count(os.sep), p), reverse=True) + print("[COSIDAG] _find_new_folder: sorted candidates by depth (deepest first)") + else: + candidates.sort() + print("[COSIDAG] _find_new_folder: sorted candidates alphabetically") + + for path in candidates: + if path not in processed: + print(f"[COSIDAG] _find_new_folder: found new folder: {path}") + return path + + print(f"[COSIDAG] _find_new_folder: all {len(candidates)} candidate(s) already processed") + return None + + + +class ConditionalTriggerDagRunOperator(TriggerDagRunOperator): + """ + Wraps TriggerDagRunOperator to skip execution if 'auto_retrig' is False in dag_run.conf. + Also supports max_retrig_runs to limit the number of automatic retriggers. + """ + + def __init__(self, max_retrig_runs=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.max_retrig_runs = max_retrig_runs + + def execute(self, context): + dag_run = context.get("dag_run") + conf = (dag_run.conf or {}) if dag_run else {} + + # Check runtime override + val = conf.get("auto_retrig") + if val is not None: + is_on = val + if isinstance(val, str): + is_on = val.strip().lower() in {"1", "true", "t", "yes", "y", "on"} + + if not is_on: + print(f"[COSIDAG] Skipping automatic_retrig (dag_run.conf['auto_retrig']={val})") + return None + + # Check run counter limit (before increment, which happens in the template) + if self.max_retrig_runs is not None: + run_count = conf.get("retrig_run_count", 0) + run_count = int(run_count) if isinstance(run_count, (int, str)) else 0 + + if run_count >= self.max_retrig_runs: + print(f"[COSIDAG] Skipping automatic_retrig (reached max_retrig_runs={self.max_retrig_runs}, current_count={run_count})") + return None + + return super().execute(context) + + +# ---- COSIDAG -------------------------------------------------------------------- + + +class COSIDAG(DAG): + """ + DAG subclass that 
wires: + check_new_file -> automatic_retrig -> resolve_inputs -> [custom] -> show_results + + Optional steps can be disabled: + - check_new_file is not created if monitoring_folders is empty. + - automatic_retrig is not created if auto_retrig is False. + """ + + def __init__( + self, + monitoring_folders, + level: int = 1, + date: Optional[str] = None, + date_queries: Optional[str | list[str]] = None, + build_custom: Optional[Callable[[DAG], None]] = None, + sensor_poke_seconds: int = 30, + sensor_timeout_seconds: int = 60 * 60 * 6, + home_env_var: str = "COSIFLOW_HOME_URL", + idle_seconds: int = 20, + min_files: int = 1, + ready_marker: Optional[str] = None, + only_basename: Optional[str] = None, + prefer_deepest: bool = True, + file_patterns: Optional[dict] = None, # {"xcom_key": "glob_pattern", ...} + select_policy: str = "first", # "first" | "latest_mtime" + tags: Optional[list[str]] = None, + default_args_extra: Optional[dict] = None, + auto_retrig: bool = True, + max_retrig_runs: Optional[int] = None, + *args, + **kwargs, + ) -> None: + # --- merge default_args --- + # priority: kwargs.default_args < _BASE_DEFAULT_ARGS < default_args_extra + base = dict(_BASE_DEFAULT_ARGS) + if "default_args" in kwargs and kwargs["default_args"]: + base.update(kwargs["default_args"]) # allows override from caller + if default_args_extra: + base.update(default_args_extra) # extensions/override requested + + # ensure that DAG receives the final default_args + kwargs["default_args"] = base + + # --- merge tags --- + existing_tags = list(kwargs.get("tags", []) or []) + merged_tags = sorted(set((tags or []) + existing_tags)) + if merged_tags: + kwargs["tags"] = merged_tags + + super().__init__(*args, **kwargs) + + # Decide whether monitoring is enabled (task existence, not just runtime behavior). 
+ self.has_monitoring = bool(monitoring_folders) + + # Base params (can be overridden by dag_run.conf at runtime) + self.params.update( + { + "monitoring_folders": monitoring_folders, + "level": int(level), + "date": date, + "date_queries": date_queries, + "home_env_var": home_env_var, + "idle_seconds": int(idle_seconds), + "min_files": int(min_files), + "ready_marker": ready_marker, + "only_basename": only_basename, + "prefer_deepest": bool(prefer_deepest), + "file_patterns": file_patterns, + "select_policy": select_policy, + "max_active_runs": int(kwargs.get("max_active_runs", 2)), + "max_active_tasks": int(kwargs.get("max_active_tasks", 8)), + "concurrency": int(kwargs.get("concurrency", 8)), + "auto_retrig": bool(auto_retrig), + "max_retrig_runs": max_retrig_runs, + } + ) + + self.auto_retrig = bool(auto_retrig) + self.max_retrig_runs = max_retrig_runs + + print( + "[COSIDAG] enabled: " + f"check_new_file={self.has_monitoring}, " + f"automatic_retrig={self.auto_retrig}, " + f"resolve_inputs={bool(file_patterns)}" + ) + + # --------------------------------------------------------------------- + # 1) check_new_file — PythonSensor (optional) + # --------------------------------------------------------------------- + + def _sensor_poke(ti, **context): + conf = (context.get("dag_run").conf or {}) if context.get("dag_run") else {} + monitoring = conf.get("monitoring_folders", self.params["monitoring_folders"]) + level_val = int(conf.get("level", self.params["level"])) + + # Date queries: runtime conf has precedence. 
+ conf_date_queries = conf.get("date_queries", None) + if conf_date_queries is None: + # fallback: use the optional "date" as '==date' + conf_date = conf.get("date", self.params.get("date")) + if conf_date: + conf_date_queries = f"=={conf_date}" + else: + conf_date_queries = self.params.get("date_queries") + + idle_s = int(conf.get("idle_seconds", self.params.get("idle_seconds", 20))) + min_f = int(conf.get("min_files", self.params.get("min_files", 1))) + marker = conf.get("ready_marker", self.params.get("ready_marker")) + only_bn = conf.get("only_basename", self.params.get("only_basename")) + prefer_deep = bool(conf.get("prefer_deepest", self.params.get("prefer_deepest", True))) + + new_path = _find_new_folder( + monitoring_folders=monitoring, + level=level_val, + date_queries=conf_date_queries, + dag_id=self.dag_id, + only_basename=only_bn, + prefer_deepest=prefer_deep, + ) + + print(f"[COSIDAG] _sensor_poke: new_path={new_path}") + if not new_path: + return False + + print(f"[COSIDAG] _sensor_poke: marker={marker}") + if marker: + marker_path = os.path.join(new_path, marker) + if not os.path.exists(marker_path): + return False + + print(f"[COSIDAG] _sensor_poke: idle_seconds={idle_s}, min_files={min_f}") + if not _is_dir_stable(new_path, idle_seconds=idle_s, min_files=min_f): + return False + + print("[COSIDAG] _sensor_poke: pushing detected_folder to XCom") + ti.xcom_push(key="detected_folder", value=new_path) + processed = _load_processed_set(self.dag_id) + processed.add(new_path) + _save_processed_set(self.dag_id, processed) + return True + + check_new_file = None + if self.has_monitoring: + check_new_file = PythonSensor( + task_id="check_new_file", + poke_interval=sensor_poke_seconds, + timeout=sensor_timeout_seconds, + mode="poke", + python_callable=_sensor_poke, + dag=self, + ) + self.check_new_file = check_new_file + else: + print("[COSIDAG] monitoring_folders empty → check_new_file disabled") + self.check_new_file = None + + # 
--------------------------------------------------------------------- + # 2) automatic_retrig — Trigger this same DAG again (optional) + # --------------------------------------------------------------------- + + def _unique_run_id() -> str: + ts = datetime.utcnow().strftime("%Y%m%dT%H%M%S%fZ") + return f"auto::{self.dag_id}::{ts}" + + automatic_retrig = None + if self.auto_retrig: + import inspect + + trig_kwargs = { + "task_id": "automatic_retrig", + "trigger_dag_id": self.dag_id, + "reset_dag_run": False, + "wait_for_completion": False, + "dag": self, + } + + # Propagate conf from previous run — must be valid JSON. + # Include logic to increment retrig_run_count if max_retrig_runs is set + if self.max_retrig_runs is not None: + trig_kwargs["conf"] = """{% set current_conf = dag_run.conf if dag_run and dag_run.conf else {} %} +{% set run_count = current_conf.get('retrig_run_count', 0) | int %} +{% set new_conf = current_conf.copy() %} +{% set _ = new_conf.update({'retrig_run_count': run_count + 1}) %} +{{ new_conf | tojson }}""" + else: + trig_kwargs["conf"] = "{{ dag_run.conf | tojson if dag_run and dag_run.conf else '{}' }}" + + # Airflow version differences + params = inspect.signature(TriggerDagRunOperator.__init__).parameters + if "trigger_run_id" in params: + trig_kwargs["trigger_run_id"] = _unique_run_id() + elif "run_id" in params: + trig_kwargs["run_id"] = _unique_run_id() + + trig_kwargs["max_retrig_runs"] = self.max_retrig_runs + automatic_retrig = ConditionalTriggerDagRunOperator(**trig_kwargs) + self.automatic_retrig = automatic_retrig + else: + self.automatic_retrig = None + + # --------------------------------------------------------------------- + # 3) resolve_inputs (optional) + # --------------------------------------------------------------------- + resolve_inputs = None + if file_patterns: + import glob + from airflow.exceptions import AirflowFailException + + def _resolve_inputs(**context): + ti = context["ti"] + dag_run = 
context.get("dag_run") + conf = (dag_run.conf or {}) if dag_run else {} + + # Prefer XCom from check_new_file, but allow manual runs by passing detected_folder in conf. + run_dir = None + if check_new_file is not None: + run_dir = ti.xcom_pull(task_ids="check_new_file", key="detected_folder") + if not run_dir: + run_dir = conf.get("detected_folder") + + if not run_dir or not os.path.isdir(run_dir): + raise AirflowFailException(f"[resolve_inputs] invalid run_dir: {run_dir}") + + def pick_one(paths: list[str]) -> Optional[str]: + if not paths: + return None + if select_policy == "first": + return sorted(paths)[0] + if select_policy == "latest_mtime": + return max(paths, key=lambda p: os.stat(p).st_mtime) + return sorted(paths)[0] + + def _can_open_file(file_path: str) -> bool: + """Try to open the file in read mode. Returns True if successful, False otherwise.""" + if not os.path.exists(file_path): + return False + try: + # Try to open the file in read mode + # This will fail if the file is still being written or locked by another process + with open(file_path, 'rb') as f: + # Try to read at least one byte to ensure file is readable + f.read(1) + return True + except (IOError, OSError, PermissionError, FileNotFoundError) as e: + print(f"[resolve_inputs] Cannot open file {file_path}: {e}") + return False + + # First pass: find all required files + found_files = {} + for key, pattern in file_patterns.items(): + matches = sorted(glob.glob(os.path.join(run_dir, "**", pattern), recursive=True)) + chosen = pick_one(matches) + if not chosen: + raise AirflowFailException( + f"[resolve_inputs] no file for key={key!r} pattern={pattern!r} under {run_dir}" + ) + found_files[key] = chosen + print(f"[resolve_inputs] Found {key} = {chosen}") + + # Second pass: wait for all files to be completely written (can be opened) + print(f"[resolve_inputs] Waiting for all {len(found_files)} files to be completely written...") + retry_interval = 120 # Wait 2 minutes between retries + 
max_wait_seconds = 1800 # Maximum 30 minutes total wait + start_time = time.time() + attempt = 0 + + while (time.time() - start_time) < max_wait_seconds: + attempt += 1 + all_ready = True + unready_files = [] + ready_files = [] + + # Check each file individually + for key, file_path in found_files.items(): + print(f"[resolve_inputs] Checking file {key}: {file_path}") + if _can_open_file(file_path): + ready_files.append(key) + print(f"[resolve_inputs] ✓ File {key} is ready and can be opened") + else: + all_ready = False + unready_files.append(key) + print(f"[resolve_inputs] ✗ File {key} is still being written or locked") + + if all_ready: + print(f"[resolve_inputs] All {len(found_files)} files are ready and can be opened (attempt {attempt})") + break + + elapsed = time.time() - start_time + print(f"[resolve_inputs] Attempt {attempt}: {len(ready_files)}/{len(found_files)} files ready. " + f"Still waiting for: {unready_files}. " + f"Elapsed: {elapsed:.1f}s. Retrying in {retry_interval}s...") + time.sleep(retry_interval) + else: + # Timeout reached - check each file one more time to report final status + print(f"[resolve_inputs] Timeout reached. Checking final status of all files...") + still_unready = [] + for key, file_path in found_files.items(): + if not _can_open_file(file_path): + still_unready.append(key) + print(f"[resolve_inputs] ✗ File {key} ({file_path}) still cannot be opened") + + if still_unready: + raise AirflowFailException( + f"[resolve_inputs] Timeout ({max_wait_seconds}s) waiting for files to be ready. " + f"Files still cannot be opened: {still_unready}" + ) + + # All files are ready, push to XCom + for key, file_path in found_files.items(): + ti.xcom_push(key=key, value=file_path) + print(f"[resolve_inputs] {key} = {file_path} (ready)") + + # Also republish run_dir for convenience. 
def _show_results(**context):
    """Publish the folder handled by this run and log a link to it.

    The folder is taken from the ``detected_folder`` XCom (pulled from
    ``check_new_file`` when that task exists) and, failing that, from
    ``dag_run.conf['detected_folder']``. When a homepage URL is configured in
    the environment, a ``<homepage>/folder/<relative path>`` link is built.

    Returns:
        ``{"folder": ..., "url": ...}`` (also pushed to XCom under
        ``cosidag_result``), or ``None`` when no folder is available.
    """
    ti = context["ti"]
    dag_run = context.get("dag_run")
    conf = (dag_run.conf or {}) if dag_run else {}

    # -------------------------------------------------
    # 1) Retrieve detected folder
    # -------------------------------------------------
    if check_new_file is not None:
        detected = ti.xcom_pull(task_ids="check_new_file", key="detected_folder")
    else:
        detected = ti.xcom_pull(key="detected_folder")

    # Allow manual runs
    if not detected:
        detected = conf.get("detected_folder")

    if not detected:
        print(
            "[COSIDAG] No detected folder available "
            "(monitoring disabled and no dag_run.conf['detected_folder'])"
        )
        return None

    # -------------------------------------------------
    # 2) Build deep-link URL (if possible)
    # -------------------------------------------------
    # os.environ.get(None) raises TypeError, so fall back to the default name
    # when the parameter is missing or explicitly None.
    env_name = self.params.get("home_env_var") or "COSIFLOW_HOME_URL"
    homepage = os.environ.get(env_name) or os.environ.get("COSIFLOW_HOME_URL")

    url = None
    if homepage:
        # Same variable and default the data explorer plugin uses for its root.
        data_root = os.environ.get("COSI_DATA_DIR", "/home/gamma/workspace/data").rstrip("/")
        # Strip the root only when it is a real path prefix; str.replace()
        # would also remove it from the middle of an unrelated path.
        if detected == data_root or detected.startswith(data_root + "/"):
            rel = detected[len(data_root):].lstrip("/")
        else:
            rel = detected.lstrip("/")
        url = f"{homepage.rstrip('/')}/folder/{rel}"

    # -------------------------------------------------
    # 3) Push structured result to XCom (canonical output)
    # -------------------------------------------------
    result = {
        "folder": detected,
        "url": url,
    }
    ti.xcom_push(key="cosidag_result", value=result)

    # -------------------------------------------------
    # 4) Human-friendly logs
    # -------------------------------------------------
    print("=" * 80)
    print("📂 COSIDAG RESULT")
    print(f"Folder: {detected}")
    if url:
        print(f"URL: {url}")
    else:
        print("URL: ")
    print("=" * 80)

    # -------------------------------------------------
    # 5) Optional deep-link via PathInfo (enrichment)
    # -------------------------------------------------
    # The previous guard (`not url ... and homepage`) could never be true:
    # url is only missing when homepage is missing. Run the enrichment
    # whenever a homepage is known and the optional helpers are available.
    from_path = getattr(PathInfo, "from_path", None) if PathInfo is not None else None
    if homepage and callable(from_path) and callable(build_url_fragment):
        try:
            info = from_path(detected)
            frag = build_url_fragment(info)
            deep = f"{homepage.rstrip('/')}/{frag.lstrip('/')}"
            print(f"[COSIDAG] Result page (PathInfo): {deep}")
        except Exception as e:
            # Best-effort logging only; never fail the task because of it.
            print(f"[COSIDAG] Deep-linking via PathInfo failed: {e}")

    # -------------------------------------------------
    # 6) Return value (kept for backward compatibility)
    # -------------------------------------------------
    return result
task_id="show_results", + python_callable=_show_results, + trigger_rule=TriggerRule.ALL_DONE, + dag=self, + ) + + # --------------------------------------------------------------------- + # Wiring — Build a robust chain depending on what exists. + # --------------------------------------------------------------------- + + # Start anchor: the last "pre-custom" task that exists. + anchor = None + + if check_new_file is not None and automatic_retrig is not None: + check_new_file >> automatic_retrig + anchor = automatic_retrig + elif check_new_file is not None: + anchor = check_new_file + elif automatic_retrig is not None: + # Note: without check_new_file, retrigger is still allowed (manual DAG that loops), + # but it is usually not recommended unless you pass detected_folder in conf. + anchor = automatic_retrig + + # Optional resolve_inputs comes after anchor if anchor exists; otherwise it can run standalone. + if resolve_inputs is not None: + if anchor is not None: + anchor >> resolve_inputs + anchor = resolve_inputs + + # Attach custom roots after anchor if possible. + if roots and anchor is not None: + for t in roots: + anchor >> t + + # Close chain into show_results + if anchor is not None: + anchor >> last_custom >> show_results + else: + # No monitoring, no retrigger, no resolve_inputs: run custom (or placeholder) then show results. 
def find_file_by_pattern(self, pattern: str, detected_folder: str) -> Optional[str]:
    """Find the first file matching the given regex pattern under detected_folder.

    The tree is walked top-down; directories and file names are visited in
    sorted order so that "first" is deterministic instead of depending on
    the order in which the filesystem lists entries.

    Args:
        pattern: Regular expression searched (``re.search``) in each file name
            (not in the full path).
        detected_folder: Root directory of the search.

    Returns:
        The path of the first matching file, or ``None`` when nothing matches.
    """
    print(f"[COSIDAG] find_file_by_pattern: pattern={pattern}, detected_folder={detected_folder}")
    rx = re.compile(pattern)
    for root, dirs, files in os.walk(detected_folder):
        # Sorting in place controls the order in which os.walk descends.
        dirs.sort()
        for fname in sorted(files):
            if rx.search(fname):
                return os.path.join(root, fname)
    return None
re.match(r"^\d{4}-\d{2}-\d{2}(?:_|$)", name) + ) + +def _parse_date_string(s: str) -> date: + """Parse 'YYYYMMDD' or 'YYYY-MM-DD' to datetime.date.""" + s = s.strip() + if re.match(r"^\d{8}$", s): + return datetime.strptime(s, "%Y%m%d").date() + return datetime.strptime(s, "%Y-%m-%d").date() + + +def _parse_date_query(q: str): + """ + Parse a query like '>=2025-11-01' to (op, date). + op ∈ {'==', '>=', '<=', '>', '<'}; if missing → '=='. + """ + q = q.strip() + op = "==" + for candidate in ("==", ">=", "<=", ">", "<"): + if q.startswith(candidate): + op = candidate + q = q[len(candidate):].strip() + break + d = _parse_date_string(q) + return op, d + + +def _apply_date_queries(d: date, queries) -> bool: + """Return True if the date d satisfies ALL queries.""" + if not queries: + return True + + if isinstance(queries, str): + queries = [queries] + + for q in queries: + try: + op, target = _parse_date_query(q) + except Exception as e: + print(f"[COSIDAG] _apply_date_queries: parse error for {q!r}: {e}; ignoring condition") + continue + + if op == "==" and not (d == target): + return False + if op == ">=" and not (d >= target): + return False + if op == "<=" and not (d <= target): + return False + if op == ">" and not (d > target): + return False + if op == "<" and not (d < target): + return False + + return True diff --git a/modules/paths.py b/modules/paths.py new file mode 100644 index 0000000..96ad355 --- /dev/null +++ b/modules/paths.py @@ -0,0 +1,148 @@ +from __future__ import annotations +import os +import re +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Optional, Tuple, Iterable + +# === Config === +DATA_ROOT = Path(os.getenv("COSIFLOW_DATA_ROOT", "cosi/data")) +YYYY_MM_RE = re.compile(r"^(?P\d{4})_(?P0[1-9]|1[0-2])$") + +# === Enums === +class Domain(str, Enum): + obs = "obs" + transient = "transient" + trigger = "trigger" + maps = "maps" + source = "source" + +class ObsLeaf(str, Enum): + auxil = "auxil" 
+ compton = "compton" + acs = "acs" + bto = "bto" + +class CommonLeaf(str, Enum): + plots = "plots" + products = "products" + +# === Dataclass === +@dataclass(frozen=True) +class PathInfo: + domain: Domain + year: Optional[int] = None + month: Optional[int] = None + entity_id: Optional[str] = None + leaf: Optional[str] = None + remainder: Tuple[str, ...] = () + +# === Helpers === +def _ym(year: int, month: int) -> str: + if not (1 <= month <= 12): + raise ValueError(f"Invalid month: {month}") + return f"{year:04d}_{month:02d}" + +def ensure_dir(p: Path) -> Path: + p.mkdir(parents=True, exist_ok=True) + return p + +def list_files(p: Path, glob: str = "*") -> Iterable[Path]: + return p.glob(glob) + +# === Builder specifici === +def obs_leaf_path(year: int, month: int, obs_id: str, leaf: ObsLeaf, *rel: str) -> Path: + return DATA_ROOT / "obs" / _ym(year, month) / obs_id / leaf.value / Path(*rel) + +def transient_path(year: int, month: int, transient_id: str, leaf: CommonLeaf, *rel: str) -> Path: + return DATA_ROOT / "transient" / _ym(year, month) / transient_id / leaf.value / Path(*rel) + +def trigger_path(year: int, month: int, trigger_id: str, leaf: CommonLeaf, *rel: str) -> Path: + return DATA_ROOT / "trigger" / _ym(year, month) / trigger_id / leaf.value / Path(*rel) + +def maps_path(year: int, month: int, *rel: str) -> Path: + return DATA_ROOT / "maps" / _ym(year, month) / Path(*rel) + +def source_path(src_id: str, year: int, month: int, leaf: CommonLeaf, *rel: str) -> Path: + return DATA_ROOT / "source" / src_id / _ym(year, month) / leaf.value / Path(*rel) + +# === Builder generico === +def build_path( + domain: Domain, + *, + year: Optional[int] = None, + month: Optional[int] = None, + entity_id: Optional[str] = None, + leaf: Optional[str] = None, + rel: Tuple[str, ...] 
= (), +) -> Path: + """Costruisce un path conforme allo schema della figura.""" + if domain == Domain.obs: + if None in (year, month, entity_id, leaf): + raise ValueError("obs requires year, month, obs_id and leaf") + return obs_leaf_path(year, month, entity_id, ObsLeaf(leaf), *rel) + + if domain in (Domain.transient, Domain.trigger): + if None in (year, month, entity_id, leaf): + raise ValueError(f"{domain.value} requires year, month, id and leaf") + fn = transient_path if domain == Domain.transient else trigger_path + return fn(year, month, entity_id, CommonLeaf(leaf), *rel) + + if domain == Domain.maps: + if None in (year, month): + raise ValueError("maps requires year and month") + return maps_path(year, month, *rel) + + if domain == Domain.source: + if None in (entity_id, year, month, leaf): + raise ValueError("source requires src_id, year, month, leaf") + return source_path(entity_id, year, month, CommonLeaf(leaf), *rel) + + raise ValueError(f"Unsupported domain: {domain}") + +# === Parser === +def parse_path(p: Path) -> PathInfo: + """Interpreta un path e restituisce un PathInfo con dominio, anno, mese, id e leaf.""" + parts = p.parts[p.parts.index("data")+1:] if "data" in p.parts else p.parts + domain = Domain(parts[0]) + + if domain == Domain.obs: + ym, entity_id, leaf, *rem = parts[1:] + m = YYYY_MM_RE.match(ym) + return PathInfo(domain, int(m["year"]), int(m["month"]), entity_id, leaf, tuple(rem)) + + if domain in (Domain.transient, Domain.trigger): + ym, entity_id, leaf, *rem = parts[1:] + m = YYYY_MM_RE.match(ym) + return PathInfo(domain, int(m["year"]), int(m["month"]), entity_id, leaf, tuple(rem)) + + if domain == Domain.maps: + ym, *rem = parts[1:] + m = YYYY_MM_RE.match(ym) + return PathInfo(domain, int(m["year"]), int(m["month"]), None, None, tuple(rem)) + + if domain == Domain.source: + src_id, ym, leaf, *rem = parts[1:] + m = YYYY_MM_RE.match(ym) + return PathInfo(domain, int(m["year"]), int(m["month"]), src_id, leaf, tuple(rem)) + + raise 
ValueError(f"Unsupported domain: {domain}") + +# === Utility extra === +def file_path(domain: Domain, *, year=None, month=None, entity_id=None, leaf=None, filename: str) -> Path: + """Restituisce il path completo di un file con nome arbitrario.""" + dirpath = build_path(domain, year=year, month=month, entity_id=entity_id, leaf=leaf) + ensure_dir(dirpath) + return dirpath / filename + +def first_match(domain: Domain, *, year=None, month=None, entity_id=None, leaf=None, pattern: str = "*") -> Optional[Path]: + """Restituisce il primo file che combacia con un pattern nella directory canonica.""" + dirpath = build_path(domain, year=year, month=month, entity_id=entity_id, leaf=leaf) + return next(dirpath.glob(pattern), None) + +def route_dest_for(src: Path, domain: Domain, *, year=None, month=None, entity_id=None, leaf=None) -> Path: + """Costruisce il percorso di destinazione mantenendo il nome originale del file.""" + dirpath = build_path(domain, year=year, month=month, entity_id=entity_id, leaf=leaf) + ensure_dir(dirpath) + return dirpath / src.name diff --git a/pipeline/generate_plot.py b/pipeline/generate_plot.py deleted file mode 100644 index 5b2194f..0000000 --- a/pipeline/generate_plot.py +++ /dev/null @@ -1,42 +0,0 @@ -import sys,os -from cosipy.util import fetch_wasabi_file -from cosipy import BinnedData -from pathlib import Path - -#/home/gamma/workspace/heasarc/dl0/2025-01-24_14-16-50/GalacticScan.inc1.id1.crab2hr.extracted.tra.gz - -# create the inputs.yaml file to process the data. -print("test") -print(sys.argv[1]) -file_path = sys.argv[1] -dir_name = os.path.dirname(file_path) - -content_to_write = f"""#----------# -# Data I/O: - -# data files available on the COSI Sharepoint: https://drive.google.com/drive/folders/1UdLfuLp9Fyk4dNussn1wt7WEOsTWrlQ6 -data_file: {file_path} # full path -ori_file: "NA" # full path -unbinned_output: 'hdf5' # 'fits' or 'hdf5' -time_bins: 60 # time bin size in seconds. Takes int, float, or list of bin edges. 
-energy_bins: [100., 200., 500., 1000., 2000., 5000.] # Takes list. Needs to match response. -phi_pix_size: 6 # binning of Compton scattering anlge [deg] -nside: 8 # healpix binning of psi chi local -scheme: 'ring' # healpix binning of psi chi local -tmin: 1835478000.0 # Min time cut in seconds. -tmax: 1835485200.0 # Max time cut in seconds. -#----------# -""" - -dir_name_path = Path(dir_name) - -# Open the file in write mode and write the content -with open(dir_name_path / "inputs.yaml", "w") as file: - file.write(content_to_write) - - -analysis = BinnedData(dir_name_path / "inputs.yaml") -analysis.read_tra(output_name = dir_name_path / "unbinned_data") -analysis.get_binned_data() -analysis.get_raw_spectrum(output_name = file_path.replace(".crab2hr.extracted.tra.gz","")) -analysis.get_raw_lightcurve(output_name = file_path.replace(".crab2hr.extracted.tra.gz","")) diff --git a/pipeline/initialize_pipeline.py b/pipeline/initialize_pipeline.py deleted file mode 100644 index d2a15eb..0000000 --- a/pipeline/initialize_pipeline.py +++ /dev/null @@ -1,18 +0,0 @@ -from cosipy.util import fetch_wasabi_file -import os -import shutil -from pathlib import Path - -# This script must be executed the first time we install this airflow app to obtain a file used to test the DAG - -home_dir = Path(os.environ['HOME']) -new_path = os.path.join(home_dir, "workspace", "data", "GalacticScan.inc1.id1.crab2hr.extracted.tra.gz") - -# Check if the file already exists -if os.path.exists(new_path): - print(f"File {new_path} already exists. 
Removing it to fetch a new one.") - # If the file exists, remove it - os.remove(new_path) - -fetch_wasabi_file(file='ComptonSphere/mini-DC2/GalacticScan.inc1.id1.crab2hr.extracted.tra.gz', - output=new_path) diff --git a/plugins/__init__.py b/plugins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plugins/data_explorer/data_explorer_plugin.py b/plugins/data_explorer/data_explorer_plugin.py new file mode 100644 index 0000000..2b7b80b --- /dev/null +++ b/plugins/data_explorer/data_explorer_plugin.py @@ -0,0 +1,261 @@ +import os +import traceback +import base64 +import mimetypes +from pathlib import Path +from airflow.plugins_manager import AirflowPlugin +from airflow.models import BaseOperator +from flask import Blueprint, render_template, send_from_directory, redirect, url_for, session, jsonify, abort +from flask_appbuilder import BaseView, expose +from jinja2 import Environment +from flask_login import login_required, current_user + +# Get from the env variable COSI_DATA_DIR the path to the data directory if it is not set, use the default path +DL0_FOLDER = os.environ.get("COSI_DATA_DIR", "/home/gamma/workspace/data") + +# Definiamo il percorso assoluto alla cartella del plugin +plugin_folder = os.path.dirname(os.path.abspath(__file__)) + +# Blueprint con percorso assoluto a templates e static +# Usato solo per registrare il path dei template +heasarc_explorer_bp = Blueprint( + "heasarc_explorer_bp", + __name__, + template_folder=os.path.join(plugin_folder, "templates"), + static_folder=os.path.join(plugin_folder, "static"), + url_prefix='/heasarcbrowser' +) + +def get_content_type(filepath, mime_type): + """Determine content type based on file extension and mime type""" + if not mime_type: + mime_type = "application/octet-stream" + + # Image types + if mime_type.startswith('image/'): + return "image" + + # Text types + if (mime_type.startswith('text/') or + mime_type in ['application/json', 'application/xml', 'application/javascript']): + 
class HEASARCExplorerView(BaseView):
    """Flask-AppBuilder view for browsing, previewing and downloading files under ``DL0_FOLDER``.

    Every user-supplied path is resolved and checked to lie inside
    ``DL0_FOLDER`` before it is used, so ``..`` segments (or symlinks pointing
    outside the data directory) cannot be used to reach other files.
    """

    default_view = "explorer_home"
    route_base = "/heasarcbrowser"

    @staticmethod
    def _resolve_in_data_root(relative_path):
        """Return the real absolute path of *relative_path* under DL0_FOLDER, or None if it escapes it."""
        root = os.path.realpath(DL0_FOLDER)
        candidate = os.path.realpath(os.path.join(root, relative_path))
        try:
            # commonpath alone keeps ".." components, hence the realpath above;
            # an exact comparison avoids the "/data" vs "/data2" prefix pitfall.
            inside = os.path.commonpath([root, candidate]) == root
        except ValueError:
            inside = False
        return candidate if inside else None

    @expose('/')
    def explorer_home(self):
        """List the top-level folders of the data directory."""
        if not current_user.is_authenticated:
            return redirect('/login/?next=/heasarcbrowser/')
        try:
            folders = sorted(
                f for f in os.listdir(DL0_FOLDER)
                if os.path.isdir(os.path.join(DL0_FOLDER, f))
            )
            # self.render_template (not flask.render_template) injects appbuilder
            return self.render_template(
                "explorer.html",
                folders=folders,
                current_path=DL0_FOLDER,
                get_file_icon=get_file_icon,
            )
        except PermissionError:
            abort(403)
        except Exception as e:
            # Keep the stack trace in the server log; do not send it to the client.
            print(f"Error loading folders: {e}\n\nTraceback:\n{traceback.format_exc()}")
            return f"Error loading folders: {e}", 500

    @expose('/folder/<path:foldername>')
    @login_required
    def explorer_folder(self, foldername):
        """List the files and sub-folders of *foldername* (relative to the data directory)."""
        # Checked outside the try block: abort() raises an HTTPException, which
        # the generic handler below would otherwise turn into a 500.
        folder_path = self._resolve_in_data_root(foldername)
        if folder_path is None:
            abort(403)

        try:
            # Check if the directory exists
            if not os.path.exists(folder_path):
                return self.render_template("explorer.html",
                                            folders=[],
                                            files=[],
                                            foldername=foldername,
                                            current_path=folder_path,
                                            error_message=f"Directory '{foldername}' does not exist.",
                                            get_file_icon=get_file_icon)

            # Check if the path is actually a directory
            if not os.path.isdir(folder_path):
                return self.render_template("explorer.html",
                                            folders=[],
                                            files=[],
                                            foldername=foldername,
                                            current_path=folder_path,
                                            error_message=f"'{foldername}' is not a directory.",
                                            get_file_icon=get_file_icon)

            # Show all files in the folder, not only pdfs
            entries = os.listdir(folder_path)
            files = sorted(f for f in entries if os.path.isfile(os.path.join(folder_path, f)))
            folders = sorted(f for f in entries if not os.path.isfile(os.path.join(folder_path, f)))

            # Add parent directory to folders list if we're not at root
            if foldername:
                parent_path = os.path.dirname(foldername)
                if parent_path and parent_path != foldername:  # Not at root
                    folders.insert(0, "..")  # Add parent directory indicator

            return self.render_template("explorer.html", folders=folders, files=files, foldername=foldername, current_path=folder_path, get_file_icon=get_file_icon)
        except PermissionError:
            abort(403)
        except Exception as e:
            print(f"Error loading files: {e}\n\nTraceback:\n{traceback.format_exc()}")
            return f"Error loading files: {e}", 500

    @expose('/download/<path:filepath>')
    @login_required
    def download_file(self, filepath):
        """Send the file at *filepath* (relative to the data directory) as an attachment."""
        abs_path = self._resolve_in_data_root(filepath)
        if abs_path is None:
            abort(403)
        if not os.path.isfile(abs_path):
            abort(404)
        folder, filename = os.path.split(abs_path)
        return send_from_directory(folder, filename, as_attachment=True)

    @expose('/preview/<path:filepath>')
    @login_required
    def preview_file(self, filepath):
        """Return a JSON preview of the file at *filepath*.

        Images are returned base64-encoded, text files as (possibly truncated)
        text; files over 10MB, other types and undecodable text are reported
        as ``binary`` with only size and mime type.
        """
        print(f"Preview request for: {filepath}")  # Debug logging
        try:
            # Security check - ensure path is within allowed directory
            abs_path = self._resolve_in_data_root(filepath)
            if abs_path is None:
                return jsonify({"error": "Access denied"}), 403

            # Check if file exists
            if not os.path.exists(abs_path):
                return jsonify({"error": "File not found"}), 404

            if not os.path.isfile(abs_path):
                return jsonify({"error": "Not a file"}), 400

            # Get file info
            file_size = os.path.getsize(abs_path)
            mime_type, _ = mimetypes.guess_type(abs_path)

            # Determine content type
            content_type = get_content_type(abs_path, mime_type)

            binary_info = {
                "content_type": "binary",
                "size": file_size,
                "mime_type": mime_type or "application/octet-stream"
            }

            # For large files, don't load them
            if file_size > 10 * 1024 * 1024:  # 10MB limit
                return jsonify(binary_info)

            if content_type == "image":
                # Load image as base64
                with open(abs_path, 'rb') as f:
                    content = base64.b64encode(f.read()).decode('utf-8')
                return jsonify({
                    "content_type": "image",
                    "content": content,
                    "mime_type": mime_type or "application/octet-stream",
                    "size": file_size
                })

            if content_type == "text":
                # Load text file
                try:
                    with open(abs_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                except UnicodeDecodeError:
                    # Not valid UTF-8 after all: report it as binary.
                    return jsonify(binary_info)
                # Limit text preview to first 50KB
                if len(content) > 50000:
                    content = content[:50000] + "\n... (truncated)"
                return jsonify({
                    "content_type": "text",
                    "content": content,
                    "mime_type": mime_type or "text/plain",
                    "size": file_size
                })

            return jsonify(binary_info)

        except Exception as e:
            return jsonify({"error": f"Error loading file: {str(e)}"}), 500
+ +

Data File Browser

+ + + + + + {% if error_message %} +
+ Error: {{ error_message }} +
+ {% endif %} + + + {% if foldername %} + + {% endif %} + + +
+ +
+ + {% if folders and folders|length > 0 %} +

Folders:

+ + {% endif %} + + + {% if files and files|length > 0 %} +

Files in {{ foldername }}:

+
    + {% for file in files %} +
  • + + +
  • + {% endfor %} +
+ {% endif %} +
+ + +
+
+

File Preview

+

Click on a file to preview its contents

+
+
+
+

Select a file to preview its contents

+
+
+
+
+
+ + + +{% endblock %} diff --git a/plugins/mailhog_link/mailhog_link_plugin.py b/plugins/mailhog_link/mailhog_link_plugin.py new file mode 100644 index 0000000..48da7cb --- /dev/null +++ b/plugins/mailhog_link/mailhog_link_plugin.py @@ -0,0 +1,15 @@ +from flask import Blueprint, redirect +import os + +# Blueprint empty (no custom routing, we use only the link) +mailhog_bp = Blueprint( + "mailhog_bp", + __name__, + url_prefix="" +) + +@mailhog_bp.route('/') +def redirect_to_mailhog(): + # use the environment variable MAILHOG_WEBUI_URL if it is set, otherwise use the default value + mail_server = os.environ.get('MAILHOG_WEBUI_URL', 'http://localhost:8025') + return redirect(mail_server, code=302) \ No newline at end of file diff --git a/plugins/mailhog_link/mailhog_link_view_plugin.py b/plugins/mailhog_link/mailhog_link_view_plugin.py new file mode 100644 index 0000000..ba09daf --- /dev/null +++ b/plugins/mailhog_link/mailhog_link_view_plugin.py @@ -0,0 +1,23 @@ +from airflow.plugins_manager import AirflowPlugin +from flask import redirect +from flask_appbuilder import BaseView, expose +import os + +class MailhogView(BaseView): + default_view = "redirect_to_mailhog" + route_base = "/mailhog" + + @expose("/") + def redirect_to_mailhog(self): + mail_server = os.environ.get('MAILHOG_WEBUI_URL', 'http://localhost:8025') + return redirect(mail_server) + +class MailhogViewPlugin(AirflowPlugin): + name = "mailhog_view_plugin" + appbuilder_views = [ + { + "name": "Mailhog", + "category": "Develop tools", + "view": MailhogView() + } + ] \ No newline at end of file diff --git a/plugins/refresh_dags_list/README.md b/plugins/refresh_dags_list/README.md new file mode 100644 index 0000000..8b8213e --- /dev/null +++ b/plugins/refresh_dags_list/README.md @@ -0,0 +1,45 @@ +# Refresh DAGs List Plugin + +This plugin adds a menu item under "Develop tools" that executes the `airflow dags list` command and refreshes the DAG bag to update the DAGs list in the Airflow UI. 
+ +## Features + +- Executes the `airflow dags list` command +- Forces a refresh of the DAG bag to update the list in the UI +- Adds a menu item under "Develop tools" dropdown menu + +## Installation + +The plugin is automatically loaded by Airflow when placed in the `plugins/` folder. + +## Usage + +1. Navigate to the Airflow web UI +2. Click on "Develop tools" in the top navigation bar +3. Click on "Refresh DAGs List" +4. The command will execute and you will be redirected to the home page with a success message +5. The DAGs list will be automatically refreshed + +## How It Works + +When you click on "Refresh DAGs List" in the menu: + +1. The plugin executes `airflow dags list` command +2. It creates a new `DagBag` instance and forces parsing of all DAGs +3. It syncs the DAG bag with the database to update the UI +4. You are redirected to the home page with a success message + +## Structure + +``` +refresh_dags_list/ +├── refresh_dags_plugin.py # Main plugin file +└── README.md # This file +``` + +## Notes + +- The plugin requires the user to be authenticated +- The `airflow dags list` command is executed in the context of the Airflow container +- The DAG bag refresh may take a few seconds to complete +- If there are any errors, they will be displayed as a warning message diff --git a/plugins/refresh_dags_list/refresh_dags_plugin.py b/plugins/refresh_dags_list/refresh_dags_plugin.py new file mode 100644 index 0000000..5aa6dbc --- /dev/null +++ b/plugins/refresh_dags_list/refresh_dags_plugin.py @@ -0,0 +1,107 @@ +import os +import subprocess +from flask import redirect, flash +from flask_appbuilder import BaseView, expose +from airflow.plugins_manager import AirflowPlugin +from airflow.models import DagBag +from airflow.utils.session import provide_session + + +class RefreshDagsView(BaseView): + """View that executes 'airflow dags list' command and refreshes the DAG bag""" + + default_view = "refresh_dags" + route_base = "/refresh_dags" + + @expose("/") + def 
refresh_dags(self): + """Execute 'airflow dags list' command and refresh DAG bag, then redirect to home""" + try: + # Execute the command + result = self._execute_dags_list() + + # Force refresh of the DAG bag + self._refresh_dagbag() + + # Show success message and redirect to home + if result.get('success', False): + flash("DAGs list refreshed successfully!", "success") + else: + flash(f"DAGs list refresh completed with warnings: {result.get('error', 'Unknown error')}", "warning") + + return redirect('/home') + except Exception as e: + flash(f"Error refreshing DAGs list: {str(e)}", "error") + return redirect('/home') + + def _execute_dags_list(self): + """Execute 'airflow dags list' command and return the output""" + try: + # Get AIRFLOW_HOME from environment + airflow_home = os.environ.get('AIRFLOW_HOME', '/home/gamma/airflow') + + # Execute the command + result = subprocess.run( + ['airflow', 'dags', 'list'], + capture_output=True, + text=True, + timeout=30, + env={**os.environ, 'AIRFLOW_HOME': airflow_home} + ) + + return { + 'success': result.returncode == 0, + 'output': result.stdout, + 'error': result.stderr + } + except subprocess.TimeoutExpired: + return { + 'success': False, + 'output': '', + 'error': 'Timeout: command took too long' + } + except Exception as e: + return { + 'success': False, + 'output': '', + 'error': f'Error during execution: {str(e)}' + } + + @provide_session + def _refresh_dagbag(self, session=None): + """Force refresh of the DAG bag to update the list in the UI""" + try: + # Get the DAGs folder path + dag_folder = os.environ.get( + 'AIRFLOW__CORE__DAGS_FOLDER', + os.path.join(os.environ.get('AIRFLOW_HOME', '/home/gamma/airflow'), 'dags') + ) + + # Create a new DagBag instance to force refresh + dagbag = DagBag(dag_folder=dag_folder, include_examples=False) + + # Force parsing of DAGs + dagbag.collect_dags() + + # Sync with database + dagbag.sync_to_db() + + return True + except Exception as e: + # Log the error but don't block 
execution + print(f"Warning: Error during DAG bag refresh: {str(e)}") + import traceback + traceback.print_exc() + return False + + +class RefreshDagsPlugin(AirflowPlugin): + name = "refresh_dags_plugin" + + appbuilder_views = [ + { + "name": "Refresh DAGs List", + "category": "Develop tools", + "view": RefreshDagsView() + } + ] diff --git a/plugins/reset_cosidag_link/reset_cosidag_plugin.py b/plugins/reset_cosidag_link/reset_cosidag_plugin.py new file mode 100644 index 0000000..005de2f --- /dev/null +++ b/plugins/reset_cosidag_link/reset_cosidag_plugin.py @@ -0,0 +1,84 @@ +import os +import traceback +import json +from flask import Blueprint, render_template, request, flash, redirect, url_for, current_app, jsonify +from flask_login import login_required +from flask_appbuilder import BaseView, expose +from airflow.plugins_manager import AirflowPlugin +from airflow.models import Variable, DagModel +from airflow.utils.session import provide_session + +# Define the absolute path to the plugin folder +plugin_folder = os.path.dirname(os.path.abspath(__file__)) + +# Blueprint to register templates and route +reset_cosidag_bp = Blueprint( + "reset_cosidag_bp", + __name__, + template_folder=os.path.join(plugin_folder, "templates"), + url_prefix="/reset_cosidag" +) + +@provide_session +def get_dag_ids(session=None): + dags = session.query(DagModel.dag_id).filter(DagModel.is_active == True).all() + return sorted([d.dag_id for d in dags]) + +class ResetCosidagView(BaseView): + default_view = "reset_cosidag" + route_base = "/reset_cosidag" + + @expose("/", methods=['GET', 'POST']) + @login_required + def reset_cosidag(self): + try: + if request.method == 'POST': + dag_id = request.form.get('dag_id') + if dag_id: + variable_key = f"COSIDAG_PROCESSED::{dag_id}" + try: + # Reset variable to empty list + Variable.set(variable_key, [], serialize_json=True) + flash(f"Successfully reset processed folders for {dag_id}. 
Variable {variable_key} set to [].", "success") + except Exception as e: + flash(f"Error resetting variable: {str(e)}", "error") + else: + flash("No DAG ID selected.", "error") + return redirect(url_for('ResetCosidagView.reset_cosidag')) + + dag_ids = get_dag_ids() + + # Using self.render_template automatically uses the correct appbuilder layout + return self.render_template("reset_cosidag.html", dag_ids=dag_ids) + except Exception as e: + return f"

Error in Reset Cosidag Plugin

{traceback.format_exc()}
", 500 + + @expose("/get_processed_folders/", methods=['GET']) + @login_required + def get_processed_folders(self, dag_id): + try: + variable_key = f"COSIDAG_PROCESSED::{dag_id}" + # Get the variable, default to empty list string if not found + val_str = Variable.get(variable_key, default_var="[]") + try: + val = json.loads(val_str) + if not isinstance(val, list): + val = [] + except json.JSONDecodeError: + val = [] + + return jsonify({"folders": val}) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +# Single Plugin Class +class ResetCosidagPlugin(AirflowPlugin): + name = "reset_cosidag_plugin" + flask_blueprints = [reset_cosidag_bp] # Register blueprint for templates path + appbuilder_views = [ + { + "name": "Reset Cosidag", + "category": "Develop tools", + "view": ResetCosidagView() + } + ] diff --git a/plugins/reset_cosidag_link/templates/reset_cosidag.html b/plugins/reset_cosidag_link/templates/reset_cosidag.html new file mode 100644 index 0000000..b1fc1c2 --- /dev/null +++ b/plugins/reset_cosidag_link/templates/reset_cosidag.html @@ -0,0 +1,135 @@ +{% extends "airflow/main.html" %} + +{% block title %}Reset COSIDAG - Airflow{% endblock %} + +{% block content %} +
+

Reset COSIDAG Processed State

+

+ This tool resets the COSIDAG_PROCESSED::{dag_id} variable for a specific COSIDAG.
+ This variable tracks processed folder paths across runs to avoid reprocessing. Resetting it will clear the history of processed folders, allowing the DAG to reprocess data if triggered. +

+ +
+ + {% with messages = get_flashed_messages(with_categories=true) %} + {% if messages %} + {% for category, message in messages %} +
+ {{ message }} +
+ {% endfor %} + {% endif %} + {% endwith %} + +
+
Reset Variable
+
+
+ {% if csrf_token %} + + {% endif %} +
+ + +
+ +
+
+
+ + +
+ + +{% endblock %} diff --git a/tutorials/README.md b/tutorials/README.md new file mode 100644 index 0000000..14aa9bc --- /dev/null +++ b/tutorials/README.md @@ -0,0 +1,199 @@ +# COSIfest Mini‑Tutorial — Building Two DAGs in Airflow +Cosiflow / COSI-AIRFLOW + +--- + +## 0) What we’ll build +- **Exercise 1 — Hello World DAG** + - `BashOperator` → create folder & `result.txt` + - `PythonOperator` → append `"Hello Wolrd!"` to the file +- **Exercise 2 — A & B** + - Two DAGs running “in parallel”, communicating via **filesystem** + - **A (ExternalPythonOperator in `cosipy`)**: render 48×48 text → factorize via SVD → save `A`, `B` and plots + - **B (ExternalPythonOperator + PythonSensor)**: wait for `factors.pkl` → reconstruct `A@B` → save plots + +--- + +## 1) Airflow basics recap +- **DAG**: Directed Acyclic Graph — a workflow +- **Task**: a node in the DAG (atomic step) +- **Operators**: + - `BashOperator`: run shell commands + - `PythonOperator`: run a Python callable in the **Airflow** runtime + - `ExternalPythonOperator`: run a Python callable in an **external interpreter** (Conda env) +- **Sensors**: tasks that **wait** for a condition (file exists, external task completes, etc.) + +--- + +## 2) Exercise 1 — Folder & file (BashOperator) +**Goal**: create `/home/gamma/workspace/data/tutorials/result.txt` +```python +from airflow.operators.bash import BashOperator + +make_file = BashOperator( + task_id="make_folder_and_file", + bash_command=( + "mkdir -p /home/gamma/workspace/data/tutorials && " + "touch /home/gamma/workspace/data/tutorials/result.txt" + ), +) +``` +**Tip**: use `mkdir -p` to be idempotent. + +--- + +## 3) Exercise 1 — Append text (PythonOperator) +**Goal**: append `"Hello Wolrd!"` (typo kept) into the file. 
+```python +from airflow.operators.python import PythonOperator +from pathlib import Path + +BASE = Path("/home/gamma/workspace/data/tutorials") +RESULT = BASE / "result.txt" + +def write_hello(): + with open(RESULT, "a", encoding="utf-8") as f: + f.write("Hello Wolrd!\n") + +with DAG( + dag_id="hello_world_dag", + default_args=default_args, + description="Minimal example: Bash touch + Python writes text", + start_date=datetime(2025, 1, 1), + schedule_interval=None, # run on-demand + catchup=False, + tags=["cosifest", "handson", "tutorials"], +) as dag: + + write_text = PythonOperator( + task_id="write_text", + python_callable=write_hello, + ) +``` +**Flow**: `make_file >> write_text` + +--- + +## 4) Run & verify Exercise 1 +- Trigger **hello_world_dag** +- Verify on the host/container: +``` +cat /home/gamma/workspace/data/tutorials/result.txt +``` +- You should see the line `Hello Wolrd!` appended. + +--- + +## 5) Why ExternalPythonOperator for Exercise 2? +- Isolate scientific dependencies in a **Conda env** (here: `cosipy`): + - `EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python"` +- Clean separation between orchestration (Airflow) and heavy libs +- Caveat: **no Airflow context** inside the external process (fine for this demo) + +--- + +## 6) Exercise 2 — Architecture +**A** (producer): +- Render multiline text into a tiny canvas → 0/1 matrix `X` +- SVD factorization: `X ≈ L @ R` +- Save: `factors.pkl` + plots `factor_L.png`, `factor_R.png` + +**B** (consumer): +- `PythonSensor` waits for `factors.pkl` +- Load `L`, `R`; compute `M = L @ R` +- Save `reconstruction_float.png` and `reconstruction_binary.png` + +Communication: **filesystem** at +`/home/gamma/workspace/data/tutorials/a_b_factor/` + +--- + +## 7) Exercise 2 — A DAG (key pattern) +Create a new DAG file in the `cosiflow/dags` folder. 
+ +```python +from airflow.operators.python import ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +with DAG( + dag_id="a_dag", + default_args=default_args, + description="A: make 32×32 text matrix, factorize via SVD into A,B and save them", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + tags=["cosifest", "handson", "tutorial", "cosipy", "producer", "linalg"], +) as dag: + + a_factorize = ExternalPythonOperator( + task_id="a_factorize_text_matrix", + python=EXTERNAL_PYTHON, + python_callable=_a_make_factors, # defined in the DAG file + op_kwargs={ + "base_dir": "/home/gamma/workspace/data/tutorials/a_b_factor", + "text": "DAGs\n ARE\nCOOL!", + "size": [48, 48], + "font_size": 6, + "rank": 12, + }, + ) +``` +**Rule**: pass everything via **`op_kwargs`** to avoid global‑scope issues. + +Now copy and paste the code contained in `cosiflow/tutorials/functions/a_standalone.py` + +--- + +## 8) Exercise 2 — B DAG (sensor + external python) +Create a new DAG file in the `cosiflow/dags` folder. 
+ +```python +from airflow.sensors.python import PythonSensor +from airflow.operators.python import ExternalPythonOperator + +with DAG( + dag_id="b_dag", + default_args=default_args, + description="B: wait for L,R factors, reconstruct L@R and re-plot the original matrix", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + tags=["cosifest", "handson", "tutorial", "consumer", "linalg"], +) as dag: + + wait_for_factors = PythonSensor( + task_id="wait_for_factors_pickle", + python_callable=_file_exists, + op_kwargs={"pkl_path": "/.../factors.pkl"}, + poke_interval=10, timeout=3600, + ) + + b_reconstruct = ExternalPythonOperator( + task_id="b_reconstruct_and_plot", + python=EXTERNAL_PYTHON, + python_callable=_b_reconstruct_and_plot, + op_kwargs={ + "base_dir": "/.../a_b_factor", + "pkl_path": "/.../a_b_factor/factors.pkl", + "bin_thr": 0.5, + }, + ) + + wait_for_factors >> b_reconstruct +``` + +Now copy and paste the code contained in `cosiflow/tutorials/functions/b_standalone.py` + +--- + +## 9) Demo flow +1. Trigger **B** first → observe the Sensor waiting +2. Trigger **A** → produces factors & plots +3. B continues → produces reconstructions +4. Show files in the shared folder + +Cleanup (optional): +``` +rm -f /home/gamma/workspace/data/tutorials/a_b_factor/* +``` diff --git a/tutorials/dags/dag_a.py b/tutorials/dags/dag_a.py new file mode 100644 index 0000000..be72913 --- /dev/null +++ b/tutorials/dags/dag_a.py @@ -0,0 +1,175 @@ +# a_dag.py +# Airflow 2.x — Alice: build 48x48 binary text matrix, factorize via SVD, save L,R and plots +from datetime import datetime + +from airflow import DAG +from airflow.operators.python import ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +# Defaults for the demo +TEXT = "DAGs\n ARE\nCOOL!" 
+SIZE = [48, 48] # pass lists in op_kwargs (safer JSON-serializable) +FONT_SIZE = 6 +RANK = 12 +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" + +def _a_make_factors(base_dir: str, text: str, size: list, font_size: int, rank: int): + """Run entirely in the external 'cosipy' interpreter. + Robustly measure multiline text size across Pillow versions (no draw.textsize). + """ + from pathlib import Path + import pickle + import numpy as np + import matplotlib + matplotlib.use("Agg") # safe non-interactive backend + import matplotlib.pyplot as plt + from PIL import Image, ImageDraw, ImageFont + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + pkl_path = base / "factors.pkl" + img_L = base / "factor_L.png" + img_R = base / "factor_R.png" + + W, H = int(size[0]), int(size[1]) + + # -- Load a mono font if available, otherwise default fallback + try: + font = ImageFont.truetype("DejaVuSansMono.ttf", font_size) + except Exception: + font = ImageFont.load_default() + + # -- Helper: robust multiline text bounding box across Pillow versions + def measure_multiline(draw: ImageDraw.ImageDraw, txt: str, font: ImageFont.ImageFont): + """Return (w, h) for multiline text. 
Tries modern APIs first, falls back gracefully.""" + if hasattr(draw, "multiline_textbbox"): + left, top, right, bottom = draw.multiline_textbbox((0, 0), txt, font=font, align="center") + return (right - left, bottom - top) + if hasattr(draw, "textbbox"): + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + if line == "": + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + widths.append(0) + heights.append(lh) + else: + l, t, r, b = draw.textbbox((0, 0), line, font=font) + widths.append(r - l) + heights.append(b - t) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + # Fallback + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + try: + w_line = draw.textlength(line, font=font) + except Exception: + w_line = max(1, int(len(line) * font.size * 0.6)) + widths.append(int(w_line)) + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + heights.append(lh) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + + # -- 1) Render text -> binary matrix (0 white, 1 black) + img = Image.new("L", (W, H), color=255) + draw = ImageDraw.Draw(img) + + w, h = measure_multiline(draw, text, font) + x = (W - w) // 2 + y = (H - h) // 2 + + if hasattr(draw, "multiline_text"): + draw.multiline_text((x, y), text, fill=0, font=font, align="center") + else: + lines = text.splitlines() or [text] + cur_y = y + for line in lines: + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + draw.text((x, cur_y), line, fill=0, font=font) + cur_y += lh + + arr = np.array(img) + X = (arr < 128).astype(float) # binary 0/1 as float + + # -- 2) SVD factorization: X ≈ (U_k sqrt(S)) (sqrt(S) V_k^T) + U, s, Vt = np.linalg.svd(X, full_matrices=False) + k = max(1, min(int(rank), len(s))) + Uk = U[:, :k] + Sk = np.diag(s[:k]) + Vk = Vt[:k, :] + 
Ssqrt = np.sqrt(Sk) + L = Uk @ Ssqrt + R = Ssqrt @ Vk + + # -- 3) Persist factors + with open(pkl_path, "wb") as f: + pickle.dump( + { + "L": L.astype("float32"), + "R": R.astype("float32"), + "meta": {"rank": int(k), "size": [W, H], "text": text}, + }, + f, + ) + + # -- 4) Visualize L and R (not binary) + def _plot_matrix(M, out_path, title): + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title(title) + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(out_path) + plt.close() + + _plot_matrix(L, img_L, f"L factor ({W}×{k})") + _plot_matrix(R, img_R, f"R factor ({k}×{H})") + + + + +default_args = { + "owner": "gamma", + "depends_on_past": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, +} + + +with DAG( + dag_id="a_dag", + default_args=default_args, + description="A: make 32×32 text matrix, factorize via SVD into A,B and save them", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + tags=["cosifest", "handson", "tutorial", "cosipy", "producer", "linalg"], +) as dag: + + a_factorize = ExternalPythonOperator( + task_id="a_factorize_text_matrix", + python=EXTERNAL_PYTHON, # interpreter in cosipy env + python_callable=_a_make_factors, # callable executed in external PY + op_kwargs={ + "base_dir": BASE_DIR, + "text": TEXT, + "size": SIZE, + "font_size": FONT_SIZE, + "rank": RANK, + }, + ) diff --git a/tutorials/dags/dag_b.py b/tutorials/dags/dag_b.py new file mode 100644 index 0000000..2b9581e --- /dev/null +++ b/tutorials/dags/dag_b.py @@ -0,0 +1,98 @@ +# b_dag.py +# Airflow 2.x — Bob: wait for factors.pkl, reconstruct L@R, plot float and binary images +from datetime import datetime + +from airflow import DAG +from airflow.sensors.python import PythonSensor +from airflow.operators.python import ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" 
+PKL_PATH = f"{BASE_DIR}/factors.pkl" +BIN_THR = 0.5 # threshold to binarize reconstruction + +def _file_exists(pkl_path: str) -> bool: + """Sensor callable: returns True when the pickle file exists.""" + import os + return os.path.exists(pkl_path) + +def _b_reconstruct_and_plot(base_dir: str, pkl_path: str, bin_thr: float): + """Run in external interpreter. Load L,R -> M=L@R; save float & binarized reconstructions.""" + from pathlib import Path + import pickle + import numpy as np + import matplotlib.pyplot as plt + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + img_rec_float = base / "reconstruction_float.png" + img_rec_bin = base / "reconstruction_binary.png" + + with open(pkl_path, "rb") as f: + payload = pickle.load(f) + + L = np.asarray(payload["L"], dtype=float) # (32×k) + R = np.asarray(payload["R"], dtype=float) # (k×32) + + # 1) Reconstruct + M = L @ R + + # 2) Save float heatmap + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title("Reconstruction (float)") + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(img_rec_float) + plt.close() + + # 3) Save binarized heatmap (to match Alice's binary look) + M_bin = (M >= bin_thr).astype(int) + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M_bin, cmap="gray_r", interpolation="nearest") + plt.title(f"Reconstruction (binary, thr={bin_thr})") + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(img_rec_bin) + plt.close() + +default_args = { + "owner": "gamma", + "depends_on_past": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, +} + +with DAG( + dag_id="b_dag", + default_args=default_args, + description="B: wait for L,R factors, reconstruct L@R and re-plot the original matrix", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + tags=["cosifest", "handson", "tutorial", "consumer", "linalg"], +) as dag: + + wait_for_factors = PythonSensor( + 
task_id="wait_for_factors_pickle", + python_callable=_file_exists, + op_kwargs={"pkl_path": PKL_PATH}, + poke_interval=10, # seconds + timeout=60 * 60, # 1 hour + mode="poke", + ) + + b_reconstruct = ExternalPythonOperator( + task_id="b_reconstruct_and_plot", + python=EXTERNAL_PYTHON, + python_callable=_b_reconstruct_and_plot, + op_kwargs={ + "base_dir": BASE_DIR, + "pkl_path": PKL_PATH, + "bin_thr": BIN_THR, + }, + ) + + wait_for_factors >> b_reconstruct \ No newline at end of file diff --git a/tutorials/dags/dag_helloworld.py b/tutorials/dags/dag_helloworld.py new file mode 100644 index 0000000..8ffed2b --- /dev/null +++ b/tutorials/dags/dag_helloworld.py @@ -0,0 +1,49 @@ +# hello_world_dag.py +# Airflow 2.x +from datetime import datetime +from pathlib import Path + +from airflow import DAG +from airflow.operators.bash import BashOperator +from airflow.operators.python import PythonOperator + +BASE_DIR = Path("/home/gamma/workspace/data/tutorials") +RESULT_FILE = BASE_DIR / "result.txt" + +def write_hello(): + """Append 'Hello Wolrd!' into result.txt. 
+    Note: the folder/file is guaranteed to exist from the Bash task."""
+    with open(RESULT_FILE, "a", encoding="utf-8") as f:
+        f.write("Hello Wolrd!\n")  # intentionally keeping the requested typo
+
+# Default arguments for the DAG
+default_args = {
+    'owner': 'gamma',
+}
+
+with DAG(
+    dag_id="hello_world_dag",
+    default_args=default_args,
+    description="Minimal example: Bash touch + Python writes text",
+    start_date=datetime(2025, 1, 1),
+    schedule_interval=None,  # run on-demand
+    catchup=False,
+    tags=["cosifest", "handson", "tutorials"],
+) as dag:
+
+    make_file = BashOperator(
+        task_id="make_folder_and_file",
+        bash_command=(
+            f"mkdir -p {BASE_DIR} && "
+            f"touch {RESULT_FILE}"
+        ),
+        # "&&" above stops at the first failing command. NOTE(review): env={} presumably replaces the inherited environment with an empty one — confirm intended
+        env={},
+    )
+
+    write_text = PythonOperator(
+        task_id="write_text",
+        python_callable=write_hello,
+    )
+
+    make_file >> write_text
diff --git a/tutorials/functions/a_standalone.py b/tutorials/functions/a_standalone.py
new file mode 100644
index 0000000..7fb4916
--- /dev/null
+++ b/tutorials/functions/a_standalone.py
@@ -0,0 +1,143 @@
+# a_standalone.py
+# Airflow 2.x — Alice: build a binary text matrix (SIZE, default 48x48), factorize via SVD, save L,R and plots
+
+# =========[ ALICE: CONFIG ]=========
+# External Python interpreter (your cosipy conda env)
+EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python"
+
+# Defaults for the demo
+TEXT = "DAGs\n ARE\nCOOL!"
+SIZE = [48, 48]  # pass lists in op_kwargs (safer JSON-serializable)
+FONT_SIZE = 6
+RANK = 12
+BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor"
+
+# =========[ ALICE: TASK CALLABLES ]=========
+def _a_make_factors(base_dir: str, text: str, size: list, font_size: int, rank: int):
+    """Render `text` as a binary matrix, factorize it with a truncated SVD, save L, R and plots.
+
+    Runs entirely in the external 'cosipy' interpreter, so all imports are local.
+    Text size is measured robustly across Pillow versions (no draw.textsize).
+
+    Args:
+        base_dir: output directory (created if missing).
+        text: text to render; may contain newlines.
+        size: [width, height] of the rendered image, in pixels.
+        font_size: size passed to DejaVuSansMono.ttf; not used by the default-font fallback.
+        rank: requested number of singular components, clamped to 1..len(s).
+
+    Writes factors.pkl (float32 L and R plus a "meta" dict), factor_L.png and factor_R.png.
+    """
+    from pathlib import Path
+    import pickle
+    import numpy as np
+    import matplotlib
+    matplotlib.use("Agg")  # safe non-interactive backend
+    import matplotlib.pyplot as plt
+    from PIL import Image, ImageDraw, ImageFont
+
+    base = Path(base_dir)
+    base.mkdir(parents=True, exist_ok=True)
+    pkl_path = base / "factors.pkl"
+    img_L = base / "factor_L.png"
+    img_R = base / "factor_R.png"
+
+    W, H = int(size[0]), int(size[1])
+
+    # -- Load a mono font if available, otherwise default fallback
+    try:
+        font = ImageFont.truetype("DejaVuSansMono.ttf", font_size)
+    except Exception:
+        font = ImageFont.load_default()
+
+    # -- Helper: nominal height of one text line (ascent + descent, else font.size)
+    def _line_height(fnt):
+        try:
+            ascent, descent = fnt.getmetrics()
+            return ascent + descent
+        except Exception:
+            return fnt.size
+
+    # -- Helper: robust multiline text bounding box across Pillow versions
+    def measure_multiline(draw: ImageDraw.ImageDraw, txt: str, font: ImageFont.ImageFont):
+        """Return (w, h) for multiline text. Tries modern APIs first, falls back gracefully."""
+        if hasattr(draw, "multiline_textbbox"):
+            left, top, right, bottom = draw.multiline_textbbox((0, 0), txt, font=font, align="center")
+            return (right - left, bottom - top)
+        lines = txt.splitlines() or [txt]  # never empty, so max()/sum() below are safe
+        widths, heights = [], []
+        if hasattr(draw, "textbbox"):
+            for line in lines:
+                if line == "":
+                    # Empty line: no box to measure, count one nominal line height
+                    widths.append(0)
+                    heights.append(_line_height(font))
+                else:
+                    l, t, r, b = draw.textbbox((0, 0), line, font=font)
+                    widths.append(r - l)
+                    heights.append(b - t)
+            return (max(widths), sum(heights))
+        # Fallback: measured (or estimated) width, one nominal line height per line
+        lh = _line_height(font)
+        for line in lines:
+            try:
+                w_line = draw.textlength(line, font=font)
+            except Exception:
+                w_line = max(1, int(len(line) * font.size * 0.6))
+            widths.append(int(w_line))
+            heights.append(lh)
+        return (max(widths), sum(heights))
+
+    # -- 1) Render text -> binary matrix (0 white, 1 black)
+    img = Image.new("L", (W, H), color=255)
+    draw = ImageDraw.Draw(img)
+
+    w, h = measure_multiline(draw, text, font)
+    x = (W - w) // 2
+    y = (H - h) // 2
+
+    if hasattr(draw, "multiline_text"):
+        draw.multiline_text((x, y), text, fill=0, font=font, align="center")
+    else:
+        # Manual layout: each line is drawn one nominal line height below the previous one
+        lh = _line_height(font)
+        cur_y = y
+        for line in text.splitlines() or [text]:
+            draw.text((x, cur_y), line, fill=0, font=font)
+            cur_y += lh
+
+    arr = np.array(img)  # rows follow the image height, columns the width: shape (H, W)
+    X = (arr < 128).astype(float)  # binary 0/1 as float
+
+    # -- 2) SVD factorization: X ≈ (U_k sqrt(S)) (sqrt(S) V_k^T)
+    U, s, Vt = np.linalg.svd(X, full_matrices=False)
+    k = max(1, min(int(rank), len(s)))
+    sqrt_s = np.sqrt(s[:k])
+    # Scaling the columns of U_k / rows of V_k^T is the product with diag(sqrt(S)), without building it
+    L = U[:, :k] * sqrt_s
+    R = sqrt_s[:, None] * Vt[:k, :]
+
+    # -- 3) Persist factors
+    with open(pkl_path, "wb") as f:
+        pickle.dump(
+            {
+                "L": L.astype("float32"),
+                "R": R.astype("float32"),
+                "meta": {"rank": int(k), "size": [W, H], "text": text},
+            },
+            f,
+        )
+
+    # -- 4) Visualize L and R (not binary)
+    def _plot_matrix(M, out_path, title):
+        plt.figure(figsize=(4, 4), dpi=120)
+        plt.imshow(M, cmap="gray_r", interpolation="nearest")
+        plt.title(title)
+        plt.axis("off")
+        plt.tight_layout(pad=0.2)
+        plt.savefig(out_path)
+        plt.close()
+
+    # X has H rows and W columns, so L is H×k and R is k×W
+    _plot_matrix(L, img_L, f"L factor ({H}×{k})")
+    _plot_matrix(R, img_R, f"R factor ({k}×{W})")
diff --git a/tutorials/functions/b_standalone.py b/tutorials/functions/b_standalone.py
new file mode 100644
index 0000000..74e4678
--- /dev/null
+++ b/tutorials/functions/b_standalone.py
@@ -0,0 +1,58 @@
+# b_standalone.py
+# Airflow 2.x — Bob: wait for factors.pkl, reconstruct L@R, plot float and binary images
+
+# =========[ BOB: CONFIG ]=========
+# External Python interpreter (your cosipy conda env)
+EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python"
+
+# Defaults for the demo
+BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor"
+PKL_PATH = f"{BASE_DIR}/factors.pkl"
+BIN_THR = 0.5  # threshold to binarize reconstruction
+
+# =========[ BOB: SENSOR ]=========
+def _file_exists(pkl_path: str) -> bool:
+    """Sensor callable: returns True when the pickle file exists."""
+    import os
+    return os.path.exists(pkl_path)
+
+# =========[ BOB: RECONSTRUCT AND PLOT ]=========
+def _b_reconstruct_and_plot(base_dir: str, pkl_path: str, bin_thr: float):
+    """Load L, R from `pkl_path`, rebuild M = L @ R, save float and binarized heatmaps.
+
+    Runs in the external interpreter, so all imports are local. Writes
+    reconstruction_float.png (M) and reconstruction_binary.png (M >= bin_thr)
+    into `base_dir`, which is created if missing.
+    """
+    from pathlib import Path
+    import pickle
+    import numpy as np
+    import matplotlib
+    matplotlib.use("Agg")  # non-interactive backend (as Alice's task does); figures are only saved to PNG
+    import matplotlib.pyplot as plt
+
+    base = Path(base_dir)
+    base.mkdir(parents=True, exist_ok=True)
+
+    # NOTE: pickle.load can execute arbitrary code; pkl_path must come from a trusted producer
+    with open(pkl_path, "rb") as f:
+        payload = pickle.load(f)
+
+    L = np.asarray(payload["L"], dtype=float)  # left factor (k columns)
+    R = np.asarray(payload["R"], dtype=float)  # right factor (k rows)
+    M = L @ R
+
+    def _save_heatmap(matrix, out_path, title):
+        """Save `matrix` to `out_path` as a gray_r image with a title and no axes."""
+        plt.figure(figsize=(4, 4), dpi=120)
+        plt.imshow(matrix, cmap="gray_r", interpolation="nearest")
+        plt.title(title)
+        plt.axis("off")
+        plt.tight_layout(pad=0.2)
+        plt.savefig(out_path)
+        plt.close()
+
+    # Float heatmap first, then the thresholded one (to match Alice's binary look)
+    _save_heatmap(M, base / "reconstruction_float.png", "Reconstruction (float)")
+    M_bin = (M >= bin_thr).astype(int)
+    _save_heatmap(M_bin, base / "reconstruction_binary.png", f"Reconstruction (binary, thr={bin_thr})")
diff --git a/tutorials/test/example_paths_usage.py b/tutorials/test/example_paths_usage.py
new file mode 100644
index 0000000..00ca012
--- /dev/null
+++ b/tutorials/test/example_paths_usage.py
@@ -0,0 +1,15 @@
+from cosiflow.paths import build_path, file_path, first_match, parse_path, Domain  # parse_path is called at the end of this script; assumed to be exported by cosiflow.paths — TODO confirm
+
+# Build a canonical path for an output file
+p = file_path(Domain.trigger,
year=2027, month=7, entity_id="trg_001",
+              leaf="plots", filename="tsmap_2deg.png")
+print(p)
+# -> cosi/data/trigger/2027_07/trg_001/plots/tsmap_2deg.png
+
+# Find the first file that matches
+found = first_match(Domain.obs, year=2027, month=8,
+                    entity_id="obs_123", leaf="compton", pattern="*.fits")
+
+# Reverse parsing (parse_path must be imported for this call; presumably from cosiflow.paths — verify)
+info = parse_path(p)
+# -> PathInfo(domain='trigger', year=2027, month=7, entity_id='trg_001', leaf='plots', remainder=('tsmap_2deg.png',))