
Commit 60e5aa4

HOTFIX: Matomo config dump postgres
1 parent 7e8de8d commit 60e5aa4

3 files changed (+120, -39 lines)

dags/data_utils/doc_helpscout.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 from airflow.models import Variable
 from requests.auth import HTTPBasicAuth
 
-from data_utils.grist.grist_helper import get_grist_api
+from .grist.grist_helper import get_grist_api
 
 connection_helpscout = BaseHook.get_connection("helpscout")
 assert connection_helpscout.login is not None

dags/data_utils/matomo_pull/matomo_helper.py

Lines changed: 22 additions & 38 deletions
@@ -4,11 +4,7 @@
 
 from .matomo_campaign_helper import process_dataframe_for_campaign
 from .matomo_request_config import matomo_requests_config
-from ..postgres_helper import (
-    get_postgres_connection,
-    clean_data_in_postgres,
-    dump_data_to_postgres,
-)
+from .matomo_postgres_helper import get_postgres_connection, clean_data_in_postgres, dump_data_to_postgres
 from .matomo_url import get_matomo_base_url, construct_url
 import logging
 
@@ -19,13 +15,13 @@
 
 def parse_range_data(raw_data):
     for entry in raw_data:
-        if entry.get("subtable"):
-            for sub_entry in entry["subtable"]:
-                sub_entry["sub_type"] = entry["label"]
+        if entry.get('subtable'):
+            for sub_entry in entry['subtable']:
+                sub_entry['sub_type'] = entry["label"]
                 raw_data.append(sub_entry)
-            entry.pop("subtable")
-        if entry.get("goals"):
-            entry.pop("goals")
+            entry.pop('subtable')
+        if entry.get('goals'):
+            entry.pop('goals')
     return raw_data
 
 
@@ -34,11 +30,11 @@ def fetch_data_for_day(base_url, report_name, config, day):
     """Fetches data from Matomo for a specific day and returns it as a DataFrame."""
     url = construct_url(base_url, config, day)
     try:
-        response = http.request("GET", url)
-        raw_data = json.loads(response.data.decode("utf-8"))
+        response = http.request('GET', url)
+        raw_data = json.loads(response.data.decode('utf-8'))
 
         # Check if the response contains errors
-        if isinstance(raw_data, dict) and raw_data.get("result") == "error":
+        if isinstance(raw_data, dict) and raw_data.get('result') == 'error':
             error_message = f"Error fetching data for {report_name} on {day}: {raw_data.get('message')}"
             raise Exception(error_message)
 
@@ -49,58 +45,46 @@ def fetch_data_for_day(base_url, report_name, config, day):
         elif isinstance(raw_data, dict):
             data = pd.json_normalize(raw_data)
         else:
-            print(
-                f"Unexpected data format for {report_name} on {day}: {type(raw_data)}"
-            )
+            print(f"Unexpected data format for {report_name} on {day}: {type(raw_data)}")
             return pd.DataFrame()
         # Add the date field to each row
-        data["date"] = pd.to_datetime(day)
+        data['date'] = pd.to_datetime(day)
         data_processed = process_dataframe_for_campaign(data)
         return data_processed
 
     except Exception as e:
         error_message = f"Error fetching data for {report_name} on {day}: {str(e)}"
         raise Exception(error_message)
 
-
+# Fetch data from Matomo for each day in the date range and merge into a single DataFrame
 def fetch_data_from_matomo(base_url, report_name, config, start_date, end_date):
     """Fetches data from Matomo for each day in the specified range and merges it into a single DataFrame."""
-    date_range = (
-        pd.date_range(start=start_date, end=end_date).strftime("%Y-%m-%d").tolist()
-    )
-    all_data = [
-        fetch_data_for_day(base_url, report_name, config, day) for day in date_range
-    ]
+    date_range = pd.date_range(start=start_date, end=end_date).strftime('%Y-%m-%d').tolist()
+    all_data = [fetch_data_for_day(base_url, report_name, config, day) for day in date_range]
 
     # Combine all non-empty DataFrames into a single DataFrame
     valid_data = [df for df in all_data if not df.empty]
     if valid_data:
         return pd.concat(valid_data, ignore_index=True)
     else:
-        logger.warning(
-            f"No data fetched for report '{report_name}' between {start_date} and {end_date}."
-        )
+        logger.warning(f"No data fetched for report '{report_name}' between {start_date} and {end_date}.")
        return pd.DataFrame()
 
-
+# Main function to fetch and dump data
 def fetch_and_dump_data(matomo_site_id, database, day):
-    """
-    Main function to fetch and dump data
-    """
-    start_date = (pd.to_datetime(day) - pd.Timedelta(days=1)).strftime("%Y-%m-%d")
+
+    start_date = (pd.to_datetime(day) - pd.Timedelta(days=1)).strftime('%Y-%m-%d')
     end_date = start_date
     base_url = get_matomo_base_url(matomo_site_id)
-    connection = get_postgres_connection("matomo_postgres", database)
+    connection = get_postgres_connection(database)
 
     if not connection:
         error_message = "Cannot proceed without database connection."
         raise Exception(error_message)
 
     for report_name, config in matomo_requests_config.items():
         print(f"Fetching data for {report_name}...")
-        data = fetch_data_from_matomo(
-            base_url, report_name, config, start_date, end_date
-        )
+        data = fetch_data_from_matomo(base_url, report_name, config, start_date, end_date)
 
         if data is not None and not data.empty:
             # Clean existing data in the table before dumping new data
@@ -110,4 +94,4 @@ def fetch_and_dump_data(matomo_site_id, database, day):
         else:
             print(f"No data fetched for {report_name}, skipping clean and dump.")
 
-    connection.close()
+    connection.close()
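For context, a minimal sketch of how the updated entry point is called after this change: get_postgres_connection now resolves the 'matomo_postgres' Airflow connection itself, so fetch_and_dump_data only passes the target database name through. The site id, database name, date, and import path (assuming the dags/ folder is on the Python path) are illustrative assumptions, not part of the commit.

from data_utils.matomo_pull.matomo_helper import fetch_and_dump_data

# Pulls the reports configured in matomo_requests_config for the day before `day`
# (start_date = day - 1 day, end_date = start_date) and writes them to Postgres.
fetch_and_dump_data(
    matomo_site_id=1,       # illustrative Matomo site id
    database="matomo",      # illustrative database name
    day="2024-01-01",       # data for 2023-12-31 is fetched
)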
dags/data_utils/matomo_pull/matomo_postgres_helper.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+from sqlalchemy import create_engine, text
+from airflow.hooks.base import BaseHook
+from sqlalchemy import inspect
+
+
+def get_postgres_connection(database):
+    """Extracts PostgreSQL connection details from Airflow and establishes a connection."""
+    try:
+        # Retrieve the connection object using Airflow's BaseHook
+        connection = BaseHook.get_connection('matomo_postgres')
+
+        # Extract connection details
+        user = connection.login
+        password = connection.password
+        host = connection.host
+        port = connection.port
+
+        # Create the SQLAlchemy engine
+        engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")
+        conn = engine.connect()
+        print("Successfully connected to the PostgreSQL database using Airflow connection.")
+        return conn
+
+    except Exception as e:
+        print(f"Failed to connect to PostgreSQL using Airflow connection: {e}")
+        raise  # Raise exception to ensure the DAG fails if the connection cannot be established
+
+
+# Clean data in PostgreSQL within the date range
+def clean_data_in_postgres(connection, table_name, start_date, end_date):
+    """Deletes rows in the table where the 'date' is between the start_date and end_date."""
+    try:
+        delete_query = text(
+            f"DELETE FROM {table_name} WHERE date BETWEEN :start_date AND :end_date"
+        )
+        connection.execute(delete_query, {'start_date': start_date, 'end_date': end_date})
+        print(f"Cleaned data in {table_name} between {start_date} and {end_date}.")
+    except Exception as e:
+        print(f"Failed to clean data in {table_name}: {e}")
+
+
+# Dump DataFrame to PostgreSQL table
+def dump_data_to_postgres(connection, data, table_name):
+    """
+    Dumps the DataFrame into the specified PostgreSQL table, creating missing columns if necessary.
+
+    Parameters:
+    connection: SQLAlchemy engine or connection object
+        The connection to the PostgreSQL database.
+    data: pandas.DataFrame
+        The DataFrame containing the data to be dumped.
+    table_name: str
+        The name of the PostgreSQL table to insert the data into.
+    """
+
+    # Convert DataFrame columns to lowercase
+    data.columns = [col.lower() for col in data.columns]
+
+    try:
+        # Inspect the existing columns in the table
+        inspector = inspect(connection)
+        existing_columns = []
+        if table_name in inspector.get_table_names():
+            existing_columns = [col['name'] for col in inspector.get_columns(table_name)]
+
+
+        # Identify missing columns
+        missing_columns = set(data.columns) - set(existing_columns)
+
+        # Add missing columns to the table
+        for column in missing_columns:
+            dtype = data[column].dtype
+            if dtype == 'int64':
+                sql_type = 'INTEGER'
+            elif dtype == 'float64':
+                sql_type = 'FLOAT'
+            elif dtype == 'bool':
+                sql_type = 'BOOLEAN'
+            elif dtype == 'datetime64[ns]':
+                sql_type = 'TIMESTAMP'
+            else:
+                sql_type = 'TEXT'
+
+            alter_query = f'ALTER TABLE {table_name} ADD COLUMN {column} {sql_type};'
+            try:
+                connection.execute(alter_query)
+                print(f"Column added: {column} ({sql_type})")
+            except Exception as e:
+                print(f"Error adding column {column}: {e}")
+
+        # Insert the data into the table
+        data.to_sql(table_name, connection, if_exists='append', index=False)
+        print(f"Data for {table_name} dumped successfully into the table.")
+
+    except Exception as e:
+        # Log the error if the data dump fails
+        print(f"Failed to dump data into {table_name}: {e}")
