diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index ff0425b..3dcb737 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -5,6 +5,7 @@ on: push: branches: - main + - helm-chart jobs: build-and-push-api: @@ -40,71 +41,3 @@ jobs: push: true tags: ${{ steps.meta-api.outputs.tags }} labels: ${{ steps.meta-api.outputs.labels }} - - build-and-push-duckdb-init: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Log in to the Container registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for DuckDB Init - id: meta-duckdb - uses: docker/metadata-action@v5 - with: - images: ghcr.io/sunu/geodini/duckdb-init - tags: | - type=sha - type=raw,value=latest - - - name: Build and push DuckDB Init image - uses: docker/build-push-action@v5 - with: - context: . - file: ./Dockerfile.duckdb - push: true - tags: ${{ steps.meta-duckdb.outputs.tags }} - labels: ${{ steps.meta-duckdb.outputs.labels }} - - build-and-push-frontend: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Log in to the Container registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for Frontend - id: meta-frontend - uses: docker/metadata-action@v5 - with: - images: ghcr.io/sunu/geodini/frontend - tags: | - type=sha - type=raw,value=latest - - - name: Build and push Frontend image - uses: docker/build-push-action@v5 - with: - context: . 
- file: ./Dockerfile.frontend - push: true - tags: ${{ steps.meta-frontend.outputs.tags }} - labels: ${{ steps.meta-frontend.outputs.labels }} diff --git a/.github/workflows/helm-publish.yml b/.github/workflows/helm-publish.yml new file mode 100644 index 0000000..8c7bbb1 --- /dev/null +++ b/.github/workflows/helm-publish.yml @@ -0,0 +1,45 @@ +name: Publish Helm Chart + +on: + push: + branches: + - main + - helm-chart + paths: + - 'helm/**' + release: + types: [published] + +jobs: + publish-helm-chart: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: '3.13.0' + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Package Helm chart + run: | + helm package helm/geodini --destination ./chart-packages + + - name: Push Helm chart to GHCR + run: | + helm push ./chart-packages/*.tgz oci://ghcr.io/sunu/geodini/helm-chart + + - name: List packaged charts + run: | + ls -la ./chart-packages/ diff --git a/Dockerfile.duckdb b/Dockerfile.duckdb deleted file mode 100644 index aa1e8f1..0000000 --- a/Dockerfile.duckdb +++ /dev/null @@ -1,12 +0,0 @@ -FROM python:3.11-slim - -# Install DuckDB CLI -RUN apt-get update && apt-get install -y wget unzip && \ - wget -O /tmp/duckdb https://github.com/duckdb/duckdb/releases/latest/download/duckdb_cli-linux-amd64.zip && \ - unzip /tmp/duckdb -d /usr/local/bin/ && \ - chmod +x /usr/local/bin/duckdb && \ - rm -rf /tmp/duckdb && \ - apt-get remove --purge -y wget unzip && apt-get autoremove -y && apt-get clean; - -# copy the scripts -COPY scripts/ /scripts/ \ No newline at end of file diff --git a/README.md b/README.md index 8a88581..80be9db 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,83 @@ # Geodini A natural language geocoding API. 
+ +## Helm Chart Installation + +This application can be deployed using the provided Helm chart located in the `helm/geodini` directory. + +### Prerequisites + +* Kubernetes cluster (e.g., Minikube, Kind, or a cloud provider's K8s service) +* Helm v3 installed +* `kubectl` configured to connect to your cluster + +### Installation Steps + +1. **Navigate to the chart directory:** + ```bash + cd helm/geodini + ``` + +2. **Review and customize values (Optional):** + Before installing, you might want to customize the deployment by overriding default values. Create a `my-values.yaml` file or inspect `values.yaml` for available options. + Key values you might want to override: + * `postgres.password`: **It is highly recommended to change the default PostgreSQL password.** + * `ingress.enabled`: Set to `true` if you have an Ingress controller and want to expose the application via Ingress. + * `ingress.hosts`: Configure your desired hostname(s). + * `ingress.tls`: Configure TLS secrets if using HTTPS. + * `appDataPersistence.size` and `postgres.persistence.size`: Adjust storage sizes as needed. + * Image tags for `api`, `frontend`, and `api.initContainer` if you want to use specific versions. + +3. **Install the chart:** + To install the chart with the release name `geodini`: + ```bash + helm install geodini . + ``` + If you have a custom values file: + ```bash + helm install geodini . -f my-values.yaml + ``` + To install into a specific namespace: + ```bash + helm install geodini . --namespace geodini-ns --create-namespace + ``` + +4. **Check deployment status:** + ```bash + kubectl get pods -n <namespace> + kubectl get svc -n <namespace> + ``` + Wait for all pods to be in the `Running` state. The `NOTES.txt` output from the Helm install command will also provide useful information on how to access the application. + +### Accessing the Application + +* **Via Port-Forward (if Ingress is not enabled):** + The `NOTES.txt` from the Helm installation will provide `kubectl port-forward` commands. 
Typically: + ```bash + # Forward Frontend + kubectl port-forward svc/geodini-frontend 8080:80 # Access at http://localhost:8080 + # Forward API (if direct access needed) + kubectl port-forward svc/geodini-api 9000:9000 + ``` +* **Via Ingress (if enabled):** + Access the application via the host and paths configured in your `values.yaml` (e.g., `http://chart-example.local/` or `https://your.domain.com/`). + +### Upgrading the Chart + +To upgrade an existing release: +```bash +helm upgrade geodini . -f my-values.yaml # Or without -f if no custom values +``` + +### Uninstalling the Chart + +To uninstall/delete the `geodini` release: +```bash +helm uninstall geodini +``` +This will remove all Kubernetes components associated with the chart. PersistentVolumeClaims (PVCs) might need to be deleted manually if you want to remove the persisted data: +```bash +kubectl delete pvc geodini-app-data geodini-postgres-data +``` +(Adjust PVC names based on your release name). diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml deleted file mode 100644 index cc264b2..0000000 --- a/docker-compose-prod.yml +++ /dev/null @@ -1,53 +0,0 @@ -services: - api-prod: - # build: - # context: . - # dockerfile: Dockerfile.api - image: geodini-api:prod - ports: - - "19000:9000" - environment: - - PORT=9000 - # Add other environment variables as needed - env_file: - - .env - volumes: - # - ./geodini:/app/geodini - - ./data:/app/data - networks: - - caddy - - default - labels: - caddy: api.geodini.labs.sunu.in - caddy.reverse_proxy: "{{upstreams 9000}}" - - frontend-prod: - # build: - # context: . 
- # dockerfile: Dockerfile.frontend - image: geodini-frontend:prod - ports: - - "10080:80" - networks: - - caddy - - default - labels: - caddy: geodini.labs.sunu.in - caddy.reverse_proxy: "{{upstreams 80}}" - depends_on: - - api-prod - - # database: - # image: postgis/postgis:17-3.4 - # ports: - # - "15432:5432" - # environment: - # - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} - # networks: - # - default - # volumes: - # - ./data/postgres:/var/lib/postgresql/data - -networks: - caddy: - external: true diff --git a/docker-compose.yml b/docker-compose.yml index 0fa71f6..50f5575 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,20 +1,19 @@ services: - init-data: + init-ingest-data: build: context: . - dockerfile: Dockerfile.duckdb + dockerfile: Dockerfile.api volumes: - - ./scripts:/scripts - - ./data:/data - command: - [ - "/bin/bash", - "-c", - "chmod +x /scripts/init-data.sh && /scripts/init-data.sh", - ] - - # This service is for initialization, so no ports are exposed. - # It will run, and upon successful completion, other services depending on it will start. 
+ - ./geodini:/app/geodini + - ./data:/app/data + environment: + - POSTGRES_HOST=database + - POSTGRES_PORT=5432 + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} + - POSTGRES_DB=postgres + - DATA_PATH=/app/data + command: ["python", "geodini/ingest.py"] api: build: @@ -29,13 +28,13 @@ services: - POSTGRES_USER=postgres - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} - POSTGRES_DB=postgres - - DATA_PATH=/app/data/overture-unified.duckdb - # Add other environment variables as needed + - DATA_PATH=/app/data env_file: - .env volumes: - ./geodini:/app/geodini - ./data:/app/data + command: [ "uvicorn", @@ -49,7 +48,7 @@ services: depends_on: database: condition: service_healthy - init-data: + init-ingest-data: condition: service_completed_successfully mcp: @@ -75,8 +74,6 @@ services: depends_on: database: condition: service_healthy - init-data: - condition: service_completed_successfully frontend: build: diff --git a/geodini/agents/complex_agents.py b/geodini/agents/complex_agents.py index 1eb98fe..0b5215b 100644 --- a/geodini/agents/complex_agents.py +++ b/geodini/agents/complex_agents.py @@ -72,7 +72,7 @@ class RoutingContext: routing_agent = Agent( - "openai:gpt-4o-mini", + "openai:gpt-4.1-mini", output_type=RoutingResult, deps_type=RoutingContext, system_prompt=""" @@ -94,7 +94,7 @@ class ComplexQueryContext: complex_geocode_query_agent = Agent( - "openai:gpt-4o-mini", + "openai:gpt-4.1-mini", output_type=ComplexGeocodeResult, deps_type=ComplexQueryContext, system_prompt=""" diff --git a/geodini/agents/simple_geocoder_agent.py b/geodini/agents/simple_geocoder_agent.py index 52b3c2c..e757ce1 100644 --- a/geodini/agents/simple_geocoder_agent.py +++ b/geodini/agents/simple_geocoder_agent.py @@ -1,6 +1,7 @@ import time from dataclasses import dataclass from pprint import pprint +from concurrent.futures import ThreadPoolExecutor import pluggy from pydantic_ai import Agent @@ -39,7 +40,7 @@ class RerankingResult: rerank_agent = Agent( # 4o-mini is smarter 
than 3.5-turbo. And does better in edge cases. - "openai:gpt-4o-mini", + "openai:gpt-4.1-mini", output_type=RerankingResult, deps_type=RerankingContext, system_prompt=""" @@ -72,7 +73,7 @@ class RephrasedQuery: rephrase_agent = Agent( - "openai:gpt-3.5-turbo", + "openai:gpt-4.1-mini", output_type=RephrasedQuery, deps_type=SearchContext, system_prompt=""" @@ -81,7 +82,7 @@ class RephrasedQuery: Extract: 1. The main search term (place name) - for example, "the city of San Francisco" should return "San Francisco", "New York City" should return "New York", "Paris, TX" should return "Paris", "Sahara Desert" should return "Sahara", "The Himalayan mountain range" should return "Himalaya", "The Amazon rainforest" should return "Amazon". - If the query is a shortened name, return the full name - for example, "usa" should return "United States" and so on. + If the query is a shortened name, return the full name - for example, "usa" or "The US" should return "United States" and so on. 2. Country code (ISO 2-letter code) if a specific country is mentioned 3. 
Whether an exact match is requested (e.g., "exactly", "precisely") @@ -108,9 +109,14 @@ async def search_places(query: str) -> list[Place]: geocoding_start_time = time.time() results = [] for geocoder_group in geocoders: - for geocoder in geocoder_group: - temp_results = geocoder(rephrased_query.output.query, limit=20) - results.extend(temp_results) + # Execute in ThreadPoolExecutor to avoid blocking the main thread + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(geocoder, rephrased_query.output.query, limit=50) + for geocoder in geocoder_group + ] + for future in futures: + results.extend(future.result()) # results = geocoder.geocode(rephrased_query.output.query, limit=20) # pprint(results) geocoding_time = time.time() - geocoding_start_time diff --git a/geodini/agents/utils/__init__.py b/geodini/agents/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/geodini/agents/utils/duckdb_exec.py b/geodini/agents/utils/duckdb_exec.py deleted file mode 100644 index 2f21fc9..0000000 --- a/geodini/agents/utils/duckdb_exec.py +++ /dev/null @@ -1,19 +0,0 @@ -import json -from typing import Any - -import duckdb - - -def duckdb_sanbox(geometries: dict[str, Any], query: str) -> dict[str, Any]: - rows = [(name, json.dumps(geom)) for name, geom in geometries.items()] - - con = duckdb.connect(":memory:") - # con = duckdb.connect("/app/data/test.db") - con.execute("INSTALL spatial; LOAD spatial;") - # Create in-memory table - con.execute("CREATE TABLE place (name TEXT, geojson TEXT);") - con.executemany("INSERT INTO place VALUES (?, ?);", rows) - result = con.execute(query).fetchone() - # print(result) - con.close() - return json.loads(result[0]) diff --git a/geodini/agents/utils/geocoder.py b/geodini/agents/utils/geocoder.py index c1f1dce..3e67a3a 100644 --- a/geodini/agents/utils/geocoder.py +++ b/geodini/agents/utils/geocoder.py @@ -2,135 +2,150 @@ import os from pprint import pprint from typing import Any +import time -import 
duckdb - -# get data path from current file -DATA_PATH = os.environ.get( - "DATA_PATH", - os.path.join( - os.path.dirname(__file__), "..", "..", "..", "data", "overture-unified.duckdb" - ), -) - - -SUBTYPES = { - "DIVISION": [ - "country", - "dependency", - "region", - "county", - "localadmin", - "locality", - "macrohood", - "neighborhood", - "microhood", - ], - "LAND": [ - "sand", - "wetland", - "desert", - ], -} - - -def geocode(query: str, limit: int | None = None) -> list[dict[str, Any]]: - conn = duckdb.connect(DATA_PATH) - conn.execute("INSTALL spatial;") - conn.execute("LOAD spatial;") - name_condition = ( - f"(common_en_name ILIKE '%{query}%' OR primary_name ILIKE '%{query}%')" - ) - sql_query = build_query(name_condition, False, (limit is not None)) - params = [query, f"%{query}%", query, f"%{query}%", query] - if limit is not None: - params.append(limit) - # print(sql_query) - result = conn.execute(sql_query, params).fetch_df() - conn.close() - # convert geometry to geojson - result["geometry"] = result["geometry"].apply( - lambda x: json.loads(x) if x is not None else None - ) - return result.to_dict(orient="records") - - -def build_query(name_condition: str, has_country_filter: bool, has_limit: bool) -> str: - """Build SQL query for searching overture unified data""" - - # only include division results for now - where_clause = "source_type = 'division'" - # geometry should be not null - where_clause += " AND geometry IS NOT NULL" - - where_clause += f" AND {name_condition}" - - # Add country code filter if provided - if has_country_filter: - where_clause += " AND LOWER(country) = LOWER(?)" - - # Simplified query that takes advantage of our views - sql_query = f""" - WITH matched_results AS ( - SELECT - id, - CASE - WHEN LOWER(primary_name) = LOWER(?) OR primary_name ILIKE ? THEN primary_name - ELSE common_en_name - END AS matched_name, - CASE - WHEN LOWER(primary_name) = LOWER(?) OR primary_name ILIKE ? 
THEN 'primary' - ELSE 'common_en' - END AS name_type, - subtype, - source_type, - hierarchies, - country, - geometry - FROM all_geometries - WHERE {where_clause} - ) - SELECT +from sqlalchemy import create_engine, text +import dotenv + +dotenv.load_dotenv() + + +# PostgreSQL connection settings +def get_postgis_engine(): + """Get PostgreSQL engine for PostGIS geocoding""" + host = os.getenv("POSTGRES_HOST") or "database" + database = os.getenv("POSTGRES_DB") or "postgres" + user = "postgres" + port = os.getenv("POSTGRES_PORT") or 5432 + password = os.getenv("POSTGRES_PASSWORD") + + return create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}") + + +def geocode(query: str, limit: int | None = 50) -> list[dict[str, Any]]: + """ + Geocode using PostgreSQL/PostGIS database with trigram similarity search. + Follows the same signature and return format as the geocode() function. + """ + start_time = time.time() + engine = get_postgis_engine() + + # Ensure pg_trgm extension is available + with engine.begin() as conn: + try: + conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;")) + except Exception as e: + print(f"Warning: Could not enable pg_trgm extension: {e}") + + connect_time = time.time() - start_time + print(f"PostgreSQL connection time: {connect_time:.2f} seconds") + + query_start_time = time.time() + + # Build the PostgreSQL query using trigram similarity + sql_query = build_postgis_query(limit is not None) + + try: + with engine.begin() as conn: + if limit is not None: + result = conn.execute(text(sql_query), {"query": query, "limit": limit}) + else: + result = conn.execute(text(sql_query), {"query": query}) + + rows = result.fetchall() + + # Convert to the same format as the original geocode function + results = [] + for row in rows: + # Parse geometry JSON if it exists + geometry = None + if row.geometry: + try: + geometry = json.loads(row.geometry) + except (json.JSONDecodeError, TypeError): + geometry = None + + results.append( + { + 
"id": row.id, + "name": row.name, + "name_type": row.name_type, + "subtype": row.subtype, + "source_type": row.source_type, + "hierarchies": ( + json.loads(row.hierarchies) if row.hierarchies else None + ), + "country": row.country, + "similarity": float(row.similarity), + "geometry": geometry, + } + ) + + except Exception as e: + print(f"Error executing PostgreSQL query: {e}") + return [] + + query_time = time.time() - query_start_time + print(f"PostgreSQL query execution time: {query_time:.2f} seconds") + + total_time = time.time() - start_time + print(f"Total query execution time: {total_time:.2f} seconds") + + return results + + +def build_postgis_query(has_limit: bool) -> str: + """Build PostgreSQL query for searching overture unified data using trigram similarity""" + + sql_query = """ + SELECT id, - matched_name AS name, - name_type, + COALESCE(common_en_name, primary_name) as name, + CASE + WHEN COALESCE(SIMILARITY(primary_name, :query), 0) >= + COALESCE(SIMILARITY(common_en_name, :query), 0) + THEN 'primary' + ELSE 'common_en' + END as name_type, subtype, source_type, hierarchies, country, - ST_AsGeoJSON(ST_GeomFromWKB(geometry)) as geometry - FROM matched_results - ORDER BY - CASE WHEN LOWER(matched_name) = LOWER(?) 
THEN 0 ELSE 1 END, - CASE name_type - WHEN 'primary' THEN 0 - ELSE 1 - END, - CASE subtype - WHEN 'country' THEN 1 - WHEN 'dependency' THEN 2 - WHEN 'region' THEN 3 - WHEN 'county' THEN 4 - WHEN 'localadmin' THEN 5 - WHEN 'locality' THEN 6 - WHEN 'macrohood' THEN 7 - WHEN 'neighborhood' THEN 8 - WHEN 'microhood' THEN 9 - WHEN 'sand' THEN 13 - WHEN 'wetland' THEN 13 - WHEN 'desert' THEN 13 - ELSE 13 - END, - matched_name + GREATEST( + COALESCE(SIMILARITY(primary_name, :query), 0), + COALESCE(SIMILARITY(common_en_name, :query), 0) + ) as similarity, + ST_AsGeoJSON(geometry) as geometry + FROM all_geometries + WHERE + source_type = 'division' + AND geometry IS NOT NULL + AND (primary_name % :query OR common_en_name % :query) + AND GREATEST( + COALESCE(SIMILARITY(primary_name, :query), 0), + COALESCE(SIMILARITY(common_en_name, :query), 0) + ) > 0.3 + ORDER BY similarity DESC """ # Add LIMIT clause only if limit is specified if has_limit: - sql_query += " LIMIT ?" + sql_query += " LIMIT :limit" return sql_query if __name__ == "__main__": - pprint(geocode("new york")) - pprint(geocode("Amazon")) + import time + + # Test PostgreSQL geocoding + print("=== Testing PostgreSQL geocoding ===") + start_time = time.time() + postgis_results = geocode("new york", limit=5) + end_time = time.time() + print(f"PostgreSQL time taken: {end_time - start_time} seconds") + print(f"PostgreSQL results: {len(postgis_results)} found") + + if postgis_results: + print("\nSample PostgreSQL result:") + pprint(postgis_results[0]) diff --git a/geodini/api/api.py b/geodini/api/api.py index e2ac078..29f73a9 100644 --- a/geodini/api/api.py +++ b/geodini/api/api.py @@ -128,8 +128,8 @@ async def health_check(): print(f"Health check failed: {str(e)}") raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}") finally: - # Close the connection - conn.close() + if "conn" in locals() and conn is not None: + conn.close() return {"status": "healthy"} diff --git a/geodini/ingest.py 
b/geodini/ingest.py new file mode 100644 index 0000000..1d685d2 --- /dev/null +++ b/geodini/ingest.py @@ -0,0 +1,793 @@ +import glob +import json +import logging +import os +import subprocess +import sys + +import dotenv +import geopandas as gpd +import numpy as np +import pandas as pd +import pyarrow.parquet as pq +from shapely import wkb +from sqlalchemy import create_engine, text + + +dotenv.load_dotenv() + +# Setup logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +host = os.getenv("POSTGRES_HOST") or "database" +database = os.getenv("POSTGRES_DB") or "postgres" +user = "postgres" +port = os.getenv("POSTGRES_PORT") or 5432 +password = os.getenv("POSTGRES_PASSWORD") + +# Add FORCE_RECREATE option +FORCE_RECREATE = os.getenv("FORCE_RECREATE", "false").lower() in ("true", "1", "yes") + +engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}") + +DATA_PATH = os.getenv("DATA_PATH") or "/tmp/data" + +# Configuration +BATCH_SIZE = 10000 # Adjust based on your system's memory + + +class NumpyAwareJSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + return json.JSONEncoder.default(self, obj) + + +def test_database_connection(): + """Test database connection and setup""" + logger.info("Testing database connection...") + logger.info(f"Connection details: {user}@{host}:{port}/{database}") + + try: + # Test basic connection + with engine.begin() as conn: + result = conn.execute(text("SELECT version();")) + version = result.fetchone()[0] + logger.info(f"Database connection successful: {version}") + + # Test PostGIS extension + try: + result = conn.execute(text("SELECT PostGIS_Version();")) + postgis_version = result.fetchone()[0] + logger.info(f"PostGIS extension available: {postgis_version}") + except Exception as e: + logger.error(f"PostGIS extension not available: {e}") + 
logger.info("Attempting to enable PostGIS...") + try: + conn.execute(text("CREATE EXTENSION IF NOT EXISTS postgis;")) + conn.execute( + text("CREATE EXTENSION IF NOT EXISTS postgis_topology;") + ) + result = conn.execute(text("SELECT PostGIS_Version();")) + postgis_version = result.fetchone()[0] + logger.info(f"PostGIS extension enabled: {postgis_version}") + except Exception as create_error: + logger.error(f"Failed to enable PostGIS: {create_error}") + raise + + # Test write permissions + conn.execute( + text( + "CREATE TABLE IF NOT EXISTS connection_test (id SERIAL PRIMARY KEY);" + ) + ) + conn.execute(text("DROP TABLE IF EXISTS connection_test;")) + logger.info("Database write permissions confirmed") + + return True + + except Exception as e: + logger.error(f"Database connection failed: {e}") + logger.error("Connection troubleshooting:") + logger.error(f" - Check if database is running on {host}:{port}") + logger.error(f" - Verify password is set correctly") + logger.error( + f" - For outside Docker, try: POSTGRES_HOST=localhost POSTGRES_PORT=15432" + ) + logger.error( + f" - For inside Docker, try: POSTGRES_HOST=database POSTGRES_PORT=5432" + ) + return False + + +def get_common_en_name(names_obj): + """Safely extract the common English name from a names object.""" + if not isinstance(names_obj, dict): + return None + + common = names_obj.get("common") + + # Handle list format: [['lang', 'name'], ['lang2', 'name2'], ...] or [('lang', 'name'), ...] + if isinstance(common, list): + for item in common: + # Handle list of lists: [['en', 'name'], ['ko', 'name'], ...] + if isinstance(item, list) and len(item) >= 2 and item[0] == "en": + return item[1] + # Handle list of tuples: [('en', 'name'), ('ko', 'name'), ...] + elif isinstance(item, tuple) and len(item) >= 2 and item[0] == "en": + return item[1] + + return None + + +def check_and_download_data(): + """Check if data exists in DATA_PATH, if not download from S3. 
Skip download if tables already exist.""" + logger.info(f"Checking for data in: {DATA_PATH}") + + # Check if tables already exist with data (unless FORCE_RECREATE is set) + if not FORCE_RECREATE: + divisions_exists, divisions_count = check_table_exists_with_data("divisions") + division_areas_exists, division_areas_count = check_table_exists_with_data( + "division_areas" + ) + + if ( + divisions_exists + and division_areas_exists + and divisions_count > 0 + and division_areas_count > 0 + ): + logger.info( + f"Tables already exist with data (divisions: {divisions_count:,}, division_areas: {division_areas_count:,})" + ) + logger.info( + "Skipping data download. Set FORCE_RECREATE=true to force re-download." + ) + return + + # Define the required directories and their S3 sources + data_requirements = { + "divisions": { + "local_path": os.path.join(DATA_PATH, "divisions"), + "s3_path": "s3://overturemaps-us-west-2/release/2025-02-19.0/theme=divisions/type=division/", + }, + "division_areas": { + "local_path": os.path.join(DATA_PATH, "division_areas"), + "s3_path": "s3://overturemaps-us-west-2/release/2025-02-19.0/theme=divisions/type=division_area/", + }, + } + + for data_type, paths in data_requirements.items(): + local_path = paths["local_path"] + s3_path = paths["s3_path"] + + # Check if directory exists and has parquet files + needs_download = False + + if not os.path.exists(local_path): + logger.info(f"Directory {local_path} does not exist") + needs_download = True + else: + # Check if directory has parquet files + parquet_files = glob.glob(os.path.join(local_path, "*.parquet")) + if not parquet_files: + logger.info( + f"Directory {local_path} exists but contains no parquet files" + ) + needs_download = True + else: + logger.info(f"Found {len(parquet_files)} parquet files in {local_path}") + + if needs_download: + logger.info(f"Downloading {data_type} data from S3...") + + # Create directory if it doesn't exist + os.makedirs(local_path, exist_ok=True) + + # 
Download from S3 using aws cli + try: + cmd = [ + "aws", + "s3", + "cp", + "--no-sign-request", + s3_path, + local_path, + "--recursive", + ] + + logger.info(f"Running command: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + + if result.stdout: + logger.info(f"Download output: {result.stdout}") + + # Verify download was successful + parquet_files = glob.glob(os.path.join(local_path, "*.parquet")) + if parquet_files: + logger.info( + f"Successfully downloaded {len(parquet_files)} parquet files to {local_path}" + ) + else: + raise Exception( + f"No parquet files found after download to {local_path}" + ) + + except subprocess.CalledProcessError as e: + logger.error(f"Error downloading {data_type} data from S3: {e}") + logger.error(f"Command output: {e.stdout}") + logger.error(f"Command error: {e.stderr}") + raise Exception(f"Failed to download {data_type} data from S3") + except FileNotFoundError: + logger.error( + "AWS CLI not found. Please install AWS CLI to download data from S3." 
+ ) + logger.error("You can install it with: pip install awscli") + raise Exception("AWS CLI not available for data download") + + logger.info("Data availability check and download completed successfully") + + +def check_table_exists_with_data(table_name): + """Check if a table exists and has data""" + try: + with engine.begin() as conn: + # Check if table exists + result = conn.execute( + text( + """ + SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_schema = 'public' + AND table_name = :table_name + ); + """ + ), + {"table_name": table_name}, + ) + table_exists = result.fetchone()[0] + + if not table_exists: + logger.info(f"Table '{table_name}' does not exist") + return False, 0 + + # Check if table has data + result = conn.execute(text(f"SELECT COUNT(*) FROM {table_name};")) + row_count = result.fetchone()[0] + + logger.info(f"Table '{table_name}' exists with {row_count:,} rows") + return True, row_count + + except Exception as e: + logger.error(f"Error checking table '{table_name}': {e}") + return False, 0 + + +def load_division_areas_in_batches(): + """Load division areas in batches to avoid memory issues""" + # Check if table exists and has data + table_exists, row_count = check_table_exists_with_data("division_areas") + + if table_exists and row_count > 0 and not FORCE_RECREATE: + logger.info( + f"Table 'division_areas' already exists with {row_count:,} rows. Skipping load." + ) + logger.info("Set FORCE_RECREATE=true to force recreation of the table.") + return row_count + + if FORCE_RECREATE and table_exists: + logger.info("FORCE_RECREATE is set. 
Will recreate division_areas table.") + + # Drop dependent view before replacing table + logger.info("Dropping dependent view before table replacement...") + with engine.begin() as conn: + conn.execute(text("DROP VIEW IF EXISTS all_geometries;")) + + logger.info("Starting to load division areas...") + + # Find all parquet files in the directory + parquet_files = glob.glob(f"{DATA_PATH}/division_areas/*.parquet") + logger.info(f"Found {len(parquet_files)} parquet files to process") + + total_areas = 0 + for file_path in parquet_files: + parquet_file = pq.ParquetFile(file_path) + total_areas += parquet_file.metadata.num_rows + + logger.info(f"Total division areas to process: {total_areas}") + + loaded_count = 0 + valid_count = 0 + batch_num = 0 + + # Process each parquet file + for file_idx, file_path in enumerate(parquet_files): + logger.info( + f"Processing file {file_idx + 1}/{len(parquet_files)}: {os.path.basename(file_path)}" + ) + + parquet_file = pq.ParquetFile(file_path) + + # Read in batches directly from parquet + for batch in parquet_file.iter_batches(batch_size=BATCH_SIZE): + # Convert to pandas DataFrame first + batch_df = batch.to_pandas() + + # Decode geometry from WKB binary format + if "geometry" in batch_df.columns: + try: + # Convert WKB binary to shapely geometries + batch_df["geometry"] = batch_df["geometry"].apply( + lambda x: wkb.loads(x) if x is not None else None + ) + # Create GeoDataFrame with proper geometry column and CRS + batch_df = gpd.GeoDataFrame( + batch_df, geometry="geometry", crs="EPSG:4326" + ) + except Exception as e: + logger.error(f"Error decoding geometry: {e}") + continue + else: + logger.warning("No geometry column found in batch, skipping...") + continue + + # Filter out null geometries and select only needed columns + batch_df = batch_df.loc[ + batch_df.geometry.notnull(), ["division_id", "geometry"] + ] + valid_batch_count = len(batch_df) + + if valid_batch_count > 0: # Only process if there are valid records + # Use 
replace for first batch with data, append for subsequent + if_exists = "replace" if loaded_count == 0 else "append" + + batch_df.to_postgis( + "division_areas", engine, if_exists=if_exists, index=False + ) + loaded_count += valid_batch_count + + valid_count += valid_batch_count + batch_num += 1 + + logger.info( + f"Processed batch {batch_num}: {valid_batch_count} valid areas, {loaded_count} total loaded" + ) + + logger.info( + f"Completed loading {loaded_count} division areas (from {total_areas} total)" + ) + return loaded_count + + +def load_divisions_in_batches(): + """Load divisions in batches with name processing""" + # Check if table exists and has data + table_exists, row_count = check_table_exists_with_data("divisions") + + if table_exists and row_count > 0 and not FORCE_RECREATE: + logger.info( + f"Table 'divisions' already exists with {row_count:,} rows. Skipping load." + ) + logger.info("Set FORCE_RECREATE=true to force recreation of the table.") + return row_count + + if FORCE_RECREATE and table_exists: + logger.info("FORCE_RECREATE is set. 
Will recreate divisions table.") + + logger.info("Starting to load divisions...") + + # Find all parquet files in the directory + parquet_files = glob.glob(f"{DATA_PATH}/divisions/*.parquet") + logger.info(f"Found {len(parquet_files)} parquet files to process") + + total_divs = 0 + for file_path in parquet_files: + parquet_file = pq.ParquetFile(file_path) + total_divs += parquet_file.metadata.num_rows + + logger.info(f"Total divisions to process: {total_divs}") + + loaded_count = 0 + batch_num = 0 + + columns_to_load = ["id", "subtype", "names", "country", "hierarchies"] + logger.info(f"Only loading specified columns: {columns_to_load}") + + # Drop dependent view before replacing table + logger.info("Dropping dependent view before table replacement...") + with engine.begin() as conn: + conn.execute(text("DROP VIEW IF EXISTS all_geometries;")) + + # Process each parquet file + for file_idx, file_path in enumerate(parquet_files): + logger.info( + f"Processing file {file_idx + 1}/{len(parquet_files)}: {os.path.basename(file_path)}" + ) + + parquet_file = pq.ParquetFile(file_path) + + # Read in batches directly from parquet + for batch in parquet_file.iter_batches( + batch_size=BATCH_SIZE, columns=columns_to_load + ): + batch_df = pd.DataFrame(batch.to_pandas()) + + # Process names + batch_df["primary_name"] = batch_df.names.apply( + lambda n: n.get("primary") if n and isinstance(n, dict) else None + ) + batch_df["common_en_name"] = batch_df.names.apply(get_common_en_name) + + # Columns with complex objects that need to be serialized to JSON + json_cols = [ + "names", + "hierarchies", + ] + + for col in json_cols: + if col in batch_df.columns: + batch_df[col] = batch_df[col].apply( + lambda x: ( + json.dumps(x, cls=NumpyAwareJSONEncoder) + if x is not None + else None + ) + ) + + # Use replace for first batch, append for subsequent + if_exists = "replace" if loaded_count == 0 else "append" + + batch_df.to_sql("divisions", engine, if_exists=if_exists, index=False) + 
batch_count = len(batch_df) + loaded_count += batch_count + batch_num += 1 + + logger.info( + f"Loaded batch {batch_num}: {batch_count} divisions, {loaded_count} total loaded" + ) + + logger.info(f"Completed loading {loaded_count} divisions") + return loaded_count + + +def create_combined_view(): + """Create a view that combines divisions with their geometries""" + logger.info("Creating combined view...") + + with engine.begin() as conn: + # Drop existing view if it exists + conn.execute(text("DROP VIEW IF EXISTS all_geometries;")) + + # Create view that joins divisions with their areas + create_view_sql = """ + CREATE VIEW all_geometries AS + SELECT + d.id, + d.subtype, + d.names, + d.country, + d.hierarchies, + d.primary_name, + d.common_en_name, + da.geometry, + 'division' as source_type + FROM divisions d + INNER JOIN division_areas da ON d.id = da.division_id + WHERE da.geometry IS NOT NULL; + """ + + conn.execute(text(create_view_sql)) + + # Get count of combined records + result = conn.execute(text("SELECT COUNT(*) FROM all_geometries;")) + combined_count = result.fetchone()[0] + + logger.info(f"Created combined view with {combined_count} records") + return combined_count + + +def check_common_name_data(): + """Check if common_en_name column has any data""" + logger.info("Checking common_en_name data availability...") + + try: + with engine.begin() as conn: + # Check total count + result = conn.execute(text("SELECT COUNT(*) FROM divisions;")) + total_count = result.fetchone()[0] + + # Check how many have non-null common_en_name + result = conn.execute( + text("SELECT COUNT(*) FROM divisions WHERE common_en_name IS NOT NULL;") + ) + common_name_count = result.fetchone()[0] + + # Check how many have non-empty common_en_name + result = conn.execute( + text( + "SELECT COUNT(*) FROM divisions WHERE common_en_name IS NOT NULL AND common_en_name != '';" + ) + ) + non_empty_common_name_count = result.fetchone()[0] + + # Get some samples of common names + result = 
conn.execute( + text( + "SELECT primary_name, common_en_name FROM divisions WHERE common_en_name IS NOT NULL AND common_en_name != '' LIMIT 10;" + ) + ) + samples = result.fetchall() + + logger.info(f"Total divisions: {total_count}") + logger.info(f"Divisions with non-null common_en_name: {common_name_count}") + logger.info( + f"Divisions with non-empty common_en_name: {non_empty_common_name_count}" + ) + logger.info( + f"Percentage with common names: {(non_empty_common_name_count / total_count) * 100:.2f}%" + ) + + if samples: + logger.info("Sample common names:") + for i, sample in enumerate(samples, 1): + logger.info( + f" {i}. Primary: '{sample.primary_name}' -> Common: '{sample.common_en_name}'" + ) + else: + logger.info("No samples found - common_en_name appears to be empty!") + + # Let's also check the raw names structure + result = conn.execute( + text("SELECT names FROM divisions WHERE names IS NOT NULL LIMIT 5;") + ) + name_samples = result.fetchall() + + logger.info("Sample raw names structures:") + for i, sample in enumerate(name_samples, 1): + try: + names_obj = json.loads(sample.names) if sample.names else {} + logger.info(f" {i}. Names structure: {names_obj}") + # Test our extraction function + common_name = get_common_en_name(names_obj) + logger.info(f" Extracted common name: {common_name}") + except Exception as e: + logger.error(f" Error parsing names: {e}") + + except Exception as e: + logger.error(f"Error checking common name data: {str(e)}") + + +def test_query(place_name): + """ + Search for places using trigram similarity on common_en_name and primary_name. + Returns top 10 matches with similarity scores. 
+ """ + logger.info(f"Searching for places similar to: '{place_name}'") + + # First ensure pg_trgm extension is available + with engine.begin() as conn: + try: + conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;")) + except Exception as e: + logger.error(f"Could not enable pg_trgm extension: {e}") + logger.error("pg_trgm extension is required for similarity search") + return [] + + # Query with trigram similarity search on both name fields + search_sql = """ + SELECT + id, + subtype, + country, + primary_name, + common_en_name, + -- Calculate similarity scores for both fields + GREATEST( + COALESCE(SIMILARITY(primary_name, :place_name), 0), + COALESCE(SIMILARITY(common_en_name, :place_name), 0) + ) as similarity_score, + -- Show which field matched better + CASE + WHEN COALESCE(SIMILARITY(primary_name, :place_name), 0) >= + COALESCE(SIMILARITY(common_en_name, :place_name), 0) + THEN 'primary_name' + ELSE 'common_en_name' + END as best_match_field, + -- Simplified geometry as GeoJSON (only for top result) + ST_AsGeoJSON(ST_Simplify(geometry, 0.05)) as simplified_geometry + FROM all_geometries + WHERE + -- Use trigram similarity operator (% means similar to) + (primary_name % :place_name OR common_en_name % :place_name) + ORDER BY similarity_score DESC, subtype + LIMIT 10; + """ + + try: + with engine.begin() as conn: + result = conn.execute(text(search_sql), {"place_name": place_name}) + + matches = result.fetchall() + + if matches: + logger.info(f"Found {len(matches)} matches:") + for i, match in enumerate(matches, 1): + # Prioritize common name if available, otherwise use primary name + display_name = match.common_en_name or match.primary_name or "N/A" + alt_name = "" + + # Show alternative name in parentheses if different from display name + if ( + match.common_en_name + and match.primary_name + and match.common_en_name != match.primary_name + ): + alt_name = f" (also: {match.primary_name})" + elif not match.common_en_name and match.primary_name: + 
alt_name = " (primary name only)" + + logger.info( + f"{i:2d}. {display_name}{alt_name} " + f"[{match.subtype}] [{match.country or 'N/A'}] " + f"- Score: {match.similarity_score:.3f} " + f"(via {match.best_match_field})" + ) + + # Show simplified geometry only for the top match + if i == 1 and match.simplified_geometry: + # Truncate very long geometries for readability + geom_text = match.simplified_geometry + logger.info(f" Geometry: {geom_text}") + + return matches + else: + logger.info("No matches found") + return [] + + except Exception as e: + logger.error(f"Error during trigram similarity search: {str(e)}") + return [] + + +def create_trigram_indexes(): + """Create trigram indexes for faster similarity searches on name columns""" + logger.info("Creating trigram indexes for faster similarity searches...") + + try: + with engine.begin() as conn: + # Ensure pg_trgm extension is available + conn.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;")) + logger.info("pg_trgm extension is available") + + # Create GIN indexes for trigram similarity on name columns + # Check if indexes already exist to avoid errors + index_queries = [ + "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_divisions_primary_name_trgm ON divisions USING gin (primary_name gin_trgm_ops);", + "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_divisions_common_en_name_trgm ON divisions USING gin (common_en_name gin_trgm_ops);", + # Also create standard indexes for other common query patterns + "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_divisions_subtype ON divisions (subtype);", + "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_divisions_country ON divisions (country);", + "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_division_areas_division_id ON division_areas (division_id);", + ] + + for idx, query in enumerate(index_queries, 1): + try: + logger.info( + f"Creating index {idx}/{len(index_queries)}: {query.split()[5] if len(query.split()) > 5 else 'unknown'}" + ) + # Note: CONCURRENTLY cannot be used 
inside a transaction, so we need separate connections + conn.execute( + text(query.replace("CONCURRENTLY ", "")) + ) # Remove CONCURRENTLY for transaction compatibility + except Exception as e: + logger.warning(f"Index creation warning (may already exist): {e}") + + # Check what indexes were created + result = conn.execute( + text( + """ + SELECT indexname, tablename + FROM pg_indexes + WHERE tablename IN ('divisions', 'division_areas') + AND indexname LIKE '%trgm%' + ORDER BY tablename, indexname; + """ + ) + ) + trigram_indexes = result.fetchall() + + if trigram_indexes: + logger.info("Trigram indexes found:") + for idx in trigram_indexes: + logger.info(f" {idx.tablename}.{idx.indexname}") + else: + logger.warning("No trigram indexes found after creation") + + # Show all indexes on these tables + result = conn.execute( + text( + """ + SELECT indexname, tablename, indexdef + FROM pg_indexes + WHERE tablename IN ('divisions', 'division_areas') + ORDER BY tablename, indexname; + """ + ) + ) + all_indexes = result.fetchall() + + logger.info( + f"All indexes on divisions and division_areas tables ({len(all_indexes)} total):" + ) + for idx in all_indexes: + logger.info(f" {idx.tablename}.{idx.indexname}") + + except Exception as e: + logger.error(f"Error creating trigram indexes: {str(e)}") + raise + + +def main(): + """Main execution function""" + logger.info("Starting geodini data ingestion...") + + if FORCE_RECREATE: + logger.info("FORCE_RECREATE is enabled - will recreate all tables") + else: + logger.info( + "FORCE_RECREATE is disabled - will skip tables that already have data" + ) + + # Test database connection first + if not test_database_connection(): + logger.error("Database connection test failed. Exiting.") + sys.exit(1) + + logger.info("Database connection test passed. 
Proceeding with ingestion...") + + try: + # Check and download data if needed + check_and_download_data() + + # Load division areas in batches + areas_count = load_division_areas_in_batches() + + # Load divisions in batches + divs_count = load_divisions_in_batches() + + # Create combined view + combined_count = create_combined_view() + + logger.info("=== INGESTION SUMMARY ===") + logger.info(f"Division areas available: {areas_count:,}") + logger.info(f"Divisions available: {divs_count:,}") + logger.info(f"Combined records available: {combined_count:,}") + logger.info("Ingestion completed successfully!") + + # Check common name data availability + logger.info("\n=== CHECKING COMMON NAME DATA ===") + check_common_name_data() + + # Test the search functionality + logger.info("\n=== TESTING SEARCH FUNCTIONALITY ===") + test_places = ["London", "Paris", "New York", "Tokyo"] + for place in test_places: + logger.info(f"\n--- Testing search for '{place}' ---") + results = test_query(place) + if not results: + logger.info("No results found") + logger.info("---") + + # Create trigram indexes + create_trigram_indexes() + + except Exception as e: + logger.error(f"Error during ingestion: {str(e)}") + raise + + +if __name__ == "__main__": + main() diff --git a/geodini/lib.py b/geodini/lib.py index 4fe5044..d40f22b 100644 --- a/geodini/lib.py +++ b/geodini/lib.py @@ -1,6 +1,7 @@ from collections.abc import Callable from typing import Any +from geodini.agents.utils.geocoder import geocode as overture_divisions_geocode from geodini import agents as geodini_agents @@ -9,4 +10,4 @@ def get_geocoders( geocoders, ) -> list[Callable[[str, int | None], list[dict[str, Any]]]]: """Get a list of geocoders""" - return [geodini_agents.utils.geocoder.geocode] + return [overture_divisions_geocode] diff --git a/helm/geodini/Chart.yaml b/helm/geodini/Chart.yaml new file mode 100644 index 0000000..e92ffc1 --- /dev/null +++ b/helm/geodini/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: geodini 
+description: A Helm chart for the Geodini application +type: application +version: 0.1.0 +appVersion: "0.1.0" diff --git a/helm/geodini/templates/NOTES.txt b/helm/geodini/templates/NOTES.txt new file mode 100644 index 0000000..5f44236 --- /dev/null +++ b/helm/geodini/templates/NOTES.txt @@ -0,0 +1,41 @@ +{{- $fullName := include "geodini.fullname" . -}} +{{- $apiServiceName := printf "%s-api" $fullName -}} +{{- $postgresServiceName := printf "%s-postgres" $fullName -}} +{{- $apiServicePort := .Values.api.service.port -}} + +Your Geodini application has been deployed. + +API Service: {{ $apiServiceName }} +{{- if .Values.postgres.enabled }} +PostgreSQL Service: {{ $postgresServiceName }} (internal) +{{- end }} + +{{- if .Values.ingress.enabled }} +To access your application via Ingress: + {{- if .Values.ingress.apiHost }} + {{- $apiHost := .Values.ingress.apiHost -}} + {{- $apiProtocol := "http" -}} + {{- range .Values.ingress.tls -}} + {{- if and .hosts (has $apiHost .hosts) -}} + {{- $apiProtocol = "https" -}} + {{- end -}} + {{- end }} + API: {{ printf "%s://%s" $apiProtocol $apiHost }} + {{- end }} +{{- else }} +To access your application, you might need to set up port-forwarding: + + kubectl port-forward svc/{{ $apiServiceName }} {{ $apiServicePort }}:{{ $apiServicePort }} + {{- if .Values.postgres.enabled }} + # For PostgreSQL (if needed for direct access, typically not required by end-users) + # kubectl port-forward svc/{{ $postgresServiceName }} 5432:{{ .Values.postgres.service.port }} + {{- end }} + +And the API at http://localhost:{{ $apiServicePort }} +{{- end }} + +{{- if .Values.postgres.enabled }} +The PostgreSQL password is set in the '{{ $fullName }}-geodini-secret' secret. 
+Default user: {{ .Values.postgres.user }} +Default database: {{ .Values.postgres.database }} +{{- end }} diff --git a/helm/geodini/templates/_helpers.tpl b/helm/geodini/templates/_helpers.tpl new file mode 100644 index 0000000..301d828 --- /dev/null +++ b/helm/geodini/templates/_helpers.tpl @@ -0,0 +1,73 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "geodini.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "geodini.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "geodini.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Common labels +*/}} +{{- define "geodini.labels" -}} +helm.sh/chart: {{ include "geodini.chart" . }} +{{ include "geodini.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "geodini.selectorLabels" -}} +app.kubernetes.io/name: {{ include "geodini.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* +Create the name of the service account to use +*/}} +{{- define "geodini.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} + {{ default (include "geodini.fullname" .) .Values.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +Return the appropriate apiVersion for deployment. +*/}} +{{- define "geodini.deployment.apiVersion" -}} +{{- if semverCompare ">=1.9-0" .Capabilities.KubeVersion.GitVersion -}} +apps/v1 +{{- else -}} +apps/v1beta2 +{{- end -}} +{{- end -}} diff --git a/helm/geodini/templates/api-deployment.yaml b/helm/geodini/templates/api-deployment.yaml new file mode 100644 index 0000000..b43dd9b --- /dev/null +++ b/helm/geodini/templates/api-deployment.yaml @@ -0,0 +1,131 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "geodini.fullname" . }}-api + labels: + {{- include "geodini.labels" . | nindent 4 }} + app.kubernetes.io/component: api +spec: + serviceName: {{ include "geodini.fullname" . }}-api-headless # For stable pod identities + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "geodini.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: api + template: + metadata: + labels: + {{- include "geodini.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: api + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "geodini.serviceAccountName" . 
}} + securityContext: + {} + volumes: + - name: temp-dir + emptyDir: {} + {{- if .Values.api.initContainer.ingest.enabled }} + initContainers: + - name: init-ingest-data + image: "{{ .Values.api.image.repository }}:{{ .Values.api.image.tag }}" + imagePullPolicy: {{ .Values.api.image.pullPolicy }} + command: {{ toJson .Values.api.initContainer.ingest.command }} + env: + - name: POSTGRES_HOST + value: {{ .Values.api.env.POSTGRES_HOST | quote }} + - name: POSTGRES_PORT + value: {{ .Values.api.env.POSTGRES_PORT | quote }} + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . }}-geodini-secret + key: POSTGRES_USER + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . }}-geodini-secret + key: POSTGRES_PASSWORD + - name: POSTGRES_DB + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . }}-geodini-secret + key: POSTGRES_DB + - name: DATA_PATH + value: /tmp/data + {{- if .Values.api.initContainer.ingest.forceRecreate }} + - name: FORCE_RECREATE + value: "true" + {{- end }} + volumeMounts: + - name: temp-dir + mountPath: /tmp + resources: + {{- toYaml .Values.api.initContainer.ingest.resources | nindent 12 }} + {{- end }} + containers: + - name: api + securityContext: + {} + image: "{{ .Values.api.image.repository }}:{{ .Values.api.image.tag }}" + imagePullPolicy: {{ .Values.api.image.pullPolicy }} + command: + - "uvicorn" + - "geodini.api.api:app" + - "--host" + - "0.0.0.0" + - "--port" + - {{ .Values.api.port | quote }} + - "--workers" + - "4" + ports: + - name: http + containerPort: {{ .Values.api.port }} + protocol: TCP + env: + - name: PORT + value: {{ .Values.api.port | quote }} + - name: POSTGRES_HOST + value: {{ .Values.api.env.POSTGRES_HOST | quote }} + - name: POSTGRES_PORT + value: {{ .Values.api.env.POSTGRES_PORT | quote }} + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . 
}}-geodini-secret + key: POSTGRES_USER + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . }}-geodini-secret + key: POSTGRES_PASSWORD + - name: POSTGRES_DB + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . }}-geodini-secret + key: POSTGRES_DB + - name: OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . }}-geodini-secret + key: OPENAI_API_KEY + livenessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 45 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 45 + periodSeconds: 20 + resources: + {{- toYaml .Values.api.resources | nindent 12 }} + volumeMounts: + - name: temp-dir + mountPath: /tmp diff --git a/helm/geodini/templates/api-headless-service.yaml b/helm/geodini/templates/api-headless-service.yaml new file mode 100644 index 0000000..ed557b5 --- /dev/null +++ b/helm/geodini/templates/api-headless-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "geodini.fullname" . }}-api-headless + labels: + {{- include "geodini.labels" . | nindent 4 }} + app.kubernetes.io/component: api +spec: + clusterIP: None # Defines this as a Headless Service + ports: + - port: {{ .Values.api.port }} # Port the service will expose (same as container port for headless) + targetPort: http # Name of the port in the Pod spec + protocol: TCP + name: http + selector: + {{- include "geodini.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: api diff --git a/helm/geodini/templates/api-service.yaml b/helm/geodini/templates/api-service.yaml new file mode 100644 index 0000000..511acc5 --- /dev/null +++ b/helm/geodini/templates/api-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "geodini.fullname" . }}-api + labels: + {{- include "geodini.labels" . 
| nindent 4 }} + app.kubernetes.io/component: api +spec: + type: {{ .Values.api.service.type }} + ports: + - port: {{ .Values.api.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "geodini.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: api diff --git a/helm/geodini/templates/geodini-secret.yaml b/helm/geodini/templates/geodini-secret.yaml new file mode 100644 index 0000000..f60f52c --- /dev/null +++ b/helm/geodini/templates/geodini-secret.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "geodini.fullname" . }}-geodini-secret + labels: + {{- include "geodini.labels" . | nindent 4 }} + app.kubernetes.io/component: backend +type: Opaque +data: + POSTGRES_USER: {{ .Values.secrets.POSTGRES_USER | b64enc | quote }} + POSTGRES_PASSWORD: {{ .Values.secrets.POSTGRES_PASSWORD | b64enc | quote }} + POSTGRES_DB: {{ .Values.secrets.POSTGRES_DB | b64enc | quote }} + OPENAI_API_KEY: {{ .Values.secrets.OPENAI_API_KEY | b64enc | quote }} diff --git a/helm/geodini/templates/ingress.yaml b/helm/geodini/templates/ingress.yaml new file mode 100644 index 0000000..84e97f9 --- /dev/null +++ b/helm/geodini/templates/ingress.yaml @@ -0,0 +1,39 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "geodini.fullname" . }} + labels: + {{- include "geodini.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- if .Values.ingress.apiHost }} + - host: {{ .Values.ingress.apiHost | quote }} + http: + paths: + - path: / + pathType: ImplementationSpecific + backend: + service: + name: {{ include "geodini.fullname" . }}-api + port: + number: {{ .Values.api.service.port }} + {{- end }} +{{- end }} diff --git a/helm/geodini/templates/postgres-deployment.yaml b/helm/geodini/templates/postgres-deployment.yaml new file mode 100644 index 0000000..e3731b9 --- /dev/null +++ b/helm/geodini/templates/postgres-deployment.yaml @@ -0,0 +1,81 @@ +{{- if .Values.postgres.enabled }} +apiVersion: {{ include "geodini.deployment.apiVersion" . }} +kind: Deployment +metadata: + name: {{ include "geodini.fullname" . }}-postgres + labels: + {{- include "geodini.labels" . | nindent 4 }} + app.kubernetes.io/component: postgres +spec: + replicas: 1 # PostgreSQL typically runs as a single replica with PVC + selector: + matchLabels: + {{- include "geodini.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: postgres + template: + metadata: + labels: + {{- include "geodini.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: postgres + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "geodini.serviceAccountName" . }} + containers: + - name: postgres + image: "{{ .Values.postgres.image.repository }}:{{ .Values.postgres.image.tag }}" + imagePullPolicy: {{ .Values.postgres.image.pullPolicy }} + ports: + - name: postgres + containerPort: {{ .Values.postgres.port }} + protocol: TCP + env: + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . }}-geodini-secret + key: POSTGRES_USER + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . 
}}-geodini-secret + key: POSTGRES_PASSWORD + - name: POSTGRES_DB # Or POSTGRES_INITDB_ARGS for more complex init + valueFrom: + secretKeyRef: + name: {{ include "geodini.fullname" . }}-geodini-secret + key: POSTGRES_DB + - name: PGDATA + value: /var/lib/postgresql/data/pgdata # Standard for postgres images + livenessProbe: + exec: + command: + - pg_isready + - -U + - $(POSTGRES_USER) + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + readinessProbe: + exec: + command: + - pg_isready + - -U + - $(POSTGRES_USER) + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + resources: + {} # Define resources + volumeMounts: + - name: postgres-data + mountPath: /var/lib/postgresql/data + volumes: + - name: postgres-data + persistentVolumeClaim: + claimName: {{ include "geodini.fullname" . }}-postgres-data +{{- end }} diff --git a/helm/geodini/templates/postgres-pvc.yaml b/helm/geodini/templates/postgres-pvc.yaml new file mode 100644 index 0000000..94239a9 --- /dev/null +++ b/helm/geodini/templates/postgres-pvc.yaml @@ -0,0 +1,18 @@ +{{- if and .Values.postgres.enabled .Values.postgres.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "geodini.fullname" . }}-postgres-data + labels: + {{- include "geodini.labels" . 
| nindent 4 }} + app.kubernetes.io/component: postgres +spec: + accessModes: + {{- toYaml .Values.postgres.persistence.accessModes | nindent 4 }} + resources: + requests: + storage: {{ .Values.postgres.persistence.size | quote }} + {{- if .Values.postgres.persistence.storageClassName }} + storageClassName: {{ .Values.postgres.persistence.storageClassName | quote }} + {{- end }} +{{- end }} diff --git a/helm/geodini/templates/postgres-service.yaml b/helm/geodini/templates/postgres-service.yaml new file mode 100644 index 0000000..ae46ff9 --- /dev/null +++ b/helm/geodini/templates/postgres-service.yaml @@ -0,0 +1,19 @@ +{{- if .Values.postgres.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "geodini.fullname" . }}-postgres + labels: + {{- include "geodini.labels" . | nindent 4 }} + app.kubernetes.io/component: postgres +spec: + type: {{ .Values.postgres.service.type }} + ports: + - port: {{ .Values.postgres.service.port }} + targetPort: postgres + protocol: TCP + name: postgres + selector: + {{- include "geodini.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: postgres +{{- end }} diff --git a/helm/geodini/templates/serviceaccount.yaml b/helm/geodini/templates/serviceaccount.yaml new file mode 100644 index 0000000..ef3c563 --- /dev/null +++ b/helm/geodini/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "geodini.serviceAccountName" . }} + labels: + {{- include "geodini.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end -}} diff --git a/helm/geodini/values.yaml b/helm/geodini/values.yaml new file mode 100644 index 0000000..d0c4fd6 --- /dev/null +++ b/helm/geodini/values.yaml @@ -0,0 +1,91 @@ +# Default values for geodini-chart. 
+replicaCount: 1 + +api: + image: + repository: ghcr.io/sunu/geodini/api + pullPolicy: IfNotPresent + tag: "latest" + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + memory: "4Gi" + port: 9000 + service: + type: ClusterIP + port: 9000 + env: + PORT: "9000" + POSTGRES_HOST: "geodini-postgres" # Service name of our postgres + POSTGRES_PORT: "5432" + # POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB are taken from a secret + initContainer: + ingest: + command: ["python", "geodini/ingest.py"] + resources: + requests: + memory: "2Gi" + limits: + memory: "4Gi" + forceRecreate: false + +# PostgreSQL/PostGIS configuration +postgres: + enabled: true + image: + repository: postgis/postgis + tag: "16-3.4" # Using 16 as 17 is not yet in postgis/postgis official images + pullPolicy: IfNotPresent + port: 5432 + service: + type: ClusterIP # Internal service + port: 5432 + # user, password, database configured in 'secrets' + persistence: + enabled: true + storageClassName: "" # Or your specific storage class + accessModes: + - ReadWriteOnce + size: 40Gi # Adjust as needed + +ingress: + enabled: false + className: "nginx" # Or your ingress controller's class name + annotations: {} + # kubernetes.io/ingress.class: nginx + # cert-manager.io/cluster-issuer: letsencrypt-prod # Example for cert-manager + + # API Ingress Configuration + apiHost: "api.geodini.local" # Placeholder, e.g., api.yourdomain.com + + # Common TLS configuration. + # You can define one or more secrets. + # If you have separate certs for frontendHost and apiHost, list them here. + # If you have a wildcard cert covering both, one entry is sufficient. 
+ tls: [] + # Example for separate certs: + # - secretName: geodini-api-tls + # hosts: + # - api.geodini.local # Must match ingress.apiHost + # Example for a wildcard cert: + # - secretName: geodini-wildcard-tls + # hosts: + # - api.geodini.local + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + create: true + annotations: {} + name: "" + +# Centralized secrets +secrets: + POSTGRES_USER: "postgres" + POSTGRES_PASSWORD: "changeme" # IMPORTANT: Change for production + POSTGRES_DB: "postgres" + OPENAI_API_KEY: "YOUR_OPENAI_API_KEY_HERE" # IMPORTANT: Change for production diff --git a/pyproject.toml b/pyproject.toml index 3e7e52b..7b4b406 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,8 +5,8 @@ description = "A natural language geocoding API." authors = [] license = "MIT" dependencies = [ + "awscli", "dotenv", - "duckdb>=0.9.0", "fastapi>=0.104.0", "numpy>=1.25.0", "openai>=1.0.0", @@ -19,7 +19,10 @@ dependencies = [ "rich>=13.0.0", "shapely>=2.0.0", "typer>=0.9.0", - "uvicorn>=0.23.0" + "uvicorn>=0.23.0", + "geopandas", + "geoalchemy2", + "pyarrow", ] requires-python = ">=3.10" diff --git a/scripts/create_db.sql b/scripts/create_db.sql deleted file mode 100644 index 99c48b9..0000000 --- a/scripts/create_db.sql +++ /dev/null @@ -1,53 +0,0 @@ --- Attach the database -ATTACH 'data/overture.duckdb' AS db; - --- Install and Load the FTS extension -INSTALL fts; -LOAD fts; - --- Create and populate the 'divisions' table -CREATE OR REPLACE TABLE db.divisions AS -SELECT - id, - subtype, - names, - country, - hierarchies, - population, - names->>'primary' AS primary_name, - names->'common'->>'en' AS common_en_name -FROM read_parquet('data/divisions.parquet') -ORDER BY names->>'primary'; - --- Create and populate the 'division_areas' table -CREATE OR REPLACE TABLE db.division_areas AS -SELECT id, division_id, geometry FROM read_parquet('data/division_areas.parquet'); - --- Create FTS index on 'divisions' table for fast name search 
-PRAGMA create_fts_index('db.divisions', 'id', 'primary_name', 'common_en_name'); - - --- -- Create and populate the 'land' table --- CREATE OR REPLACE TABLE db.land AS --- SELECT --- id, --- subtype, --- names, --- class, --- geometry, --- names->>'primary' AS primary_name, --- names->'common'->>'en' AS common_en_name --- FROM read_parquet('data/land.parquet'); - --- -- Create an FTS table for 'land' names --- CREATE VIRTUAL TABLE db.land_fts USING fts5(primary_name, common_en_name); - --- -- Populate the FTS table with land names --- INSERT INTO db.land_fts (primary_name, common_en_name) --- SELECT primary_name, common_en_name FROM db.land; - --- -- Create a view for land names using FTS --- CREATE OR REPLACE VIEW db.all_land_names AS --- SELECT l.id, l.primary_name, l.common_en_name, l.subtype, l.class, l.geometry --- FROM db.land l --- JOIN db.land_fts fts ON l.primary_name = fts.primary_name OR l.common_en_name = fts.common_en_name; diff --git a/scripts/create_unified_db.sql b/scripts/create_unified_db.sql deleted file mode 100644 index c2ea357..0000000 --- a/scripts/create_unified_db.sql +++ /dev/null @@ -1,43 +0,0 @@ --- Attach the database -ATTACH 'data/overture-unified.duckdb' AS db; - --- Install and Load the FTS extension -INSTALL fts; -LOAD fts; - --- Create and populate the 'all_geometries' table -CREATE OR REPLACE TABLE db.all_geometries AS --- Division data -SELECT - d.id, - d.subtype, - d.names, - d.country, - d.hierarchies, - d.names->>'primary' AS primary_name, - d.names->'common'->>'en' AS common_en_name, - da.geometry, - 'division' AS source_type -FROM read_parquet('data/divisions.parquet') d -LEFT JOIN read_parquet('data/division_areas.parquet') da ON d.id = da.division_id -WHERE da.geometry IS NOT NULL - -UNION ALL - --- Land data -SELECT - id, - subtype, - names, - NULL as country, - NULL as hierarchies, - names->>'primary' AS primary_name, - names->'common'->>'en' AS common_en_name, - geometry, - 'land' AS source_type -FROM 
read_parquet('data/land.parquet') -WHERE geometry IS NOT NULL -AND names IS NOT NULL; - --- Create FTS index on 'all_geometries' table for fast name search --- PRAGMA create_fts_index('db.all_geometries', 'id', 'primary_name', 'common_en_name'); diff --git a/scripts/download_overture_data.sql b/scripts/download_overture_data.sql deleted file mode 100644 index 7c6d93e..0000000 --- a/scripts/download_overture_data.sql +++ /dev/null @@ -1,54 +0,0 @@ -LOAD spatial; -- noqa -LOAD httpfs; -- noqa - --- Access the data on AWS -SET s3_region='us-west-2'; - -SELECT 'Starting Overture data download...' as message; - --- -- Download divisions --- SELECT 'Downloading divisions...' as message; --- COPY ( --- SELECT * --- FROM read_parquet('s3://overturemaps-us-west-2/release/2025-02-19.0/theme=divisions/type=division/*') --- ) TO 'data/divisions.parquet'; - --- SELECT 'Divisions download complete.' as message; - --- -- Download division areas --- SELECT 'Downloading division areas...' as message; --- COPY ( --- SELECT * --- FROM read_parquet('s3://overturemaps-us-west-2/release/2025-02-19.0/theme=divisions/type=division_area/*') --- ) TO 'data/division_areas.parquet'; - --- SELECT 'Division areas download complete.' as message; - --- Download land data -SELECT 'Downloading land data...' as message; -COPY ( - SELECT * - FROM read_parquet('s3://overturemaps-us-west-2/release/2025-02-19.0/theme=base/type=land/*') -) TO 'data/land.parquet'; - -SELECT 'Land data download complete.' as message; - --- Download water data -SELECT 'Downloading water data...' as message; -COPY ( - SELECT * - FROM read_parquet('s3://overturemaps-us-west-2/release/2025-02-19.0/theme=base/type=water/*') -) TO 'data/water.parquet'; - -SELECT 'Water data download complete.' as message; - --- Download infrastructure data -SELECT 'Downloading infrastructure data...' 
as message; -COPY ( - SELECT * - FROM read_parquet('s3://overturemaps-us-west-2/release/2025-02-19.0/theme=base/type=infrastructure/*') -) TO 'data/infrastructure.parquet'; - -SELECT 'Infrastructure data download complete.' as message; - -SELECT 'All downloads finished successfully!' as message; diff --git a/scripts/init-data.sh b/scripts/init-data.sh deleted file mode 100755 index 097a031..0000000 --- a/scripts/init-data.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -mkdir -p /data && -echo 'Starting Overture data initialization...' && -# check if the parquet files exist -if [ ! -f /data/divisions.parquet ] || [ ! -f /data/division_areas.parquet ]; then - duckdb -c '.read /scripts/download_overture_data.sql' && - echo 'Overture data download complete.' -else - echo 'Overture data already exists.' -fi -echo 'Creating DuckDB database...' && -# check if the file exists -if [ ! -f /data/overture-unified.duckdb ]; then - duckdb /data/overture-unified.duckdb -c '.read /scripts/create_unified_db.sql' && - echo 'DuckDB database creation complete.' -else - echo 'DuckDB database already exists.' -fi