diff --git a/bmsdna/lakeapi/core/config.py b/bmsdna/lakeapi/core/config.py
index 65cab25..f37cda5 100644
--- a/bmsdna/lakeapi/core/config.py
+++ b/bmsdna/lakeapi/core/config.py
@@ -53,6 +53,7 @@ class BasicConfig:
     token_retrieval_func: "Optional[Callable[[SourceUri, str], TokenCredential]]"
     should_hide_col_name: Callable[[str], bool]
     default_copy_local: bool = False
+    max_route_init_time: int = 200  # seconds
 
 
 def _should_hide_colname(name: str):
@@ -79,6 +80,7 @@ def get_default_config():
         schema_cache_ttl=5 * 60,  # 5 minutes
         token_retrieval_func=None,
         should_hide_col_name=_should_hide_colname,
+        max_route_init_time=200,
     )
 
 
diff --git a/bmsdna/lakeapi/core/route.py b/bmsdna/lakeapi/core/route.py
index 5438c41..b60ca01 100644
--- a/bmsdna/lakeapi/core/route.py
+++ b/bmsdna/lakeapi/core/route.py
@@ -1,3 +1,4 @@
+from time import time
 from typing import Literal, Tuple, cast
 
 from fastapi import APIRouter
@@ -24,6 +25,7 @@ def init_routes(configs: Configs, basic_config: BasicConfig):
     all_lake_api_routers.append((basic_config, configs))
     router = APIRouter()
     metadata = []
+    now_time = time()
     with ExecutionContextManager(
         basic_config.default_engine,
         basic_config.default_chunk_size,
@@ -34,59 +36,67 @@
                 if isinstance(config.api_method, str)
                 else config.api_method
             )
-            try:
-                from bmsdna.lakeapi.core.datasource import Datasource
-
-                assert config.datasource is not None
-                with Datasource(
-                    config.version_str,
-                    config.tag,
-                    config.name,
-                    config=config.datasource,
-                    sql_context=mgr.get_context(config.engine),
-                    basic_config=basic_config,
-                    accounts=configs.accounts,
-                ) as realdataframe:
-                    try:
-                        schema = get_schema_cached(
-                            basic_config,
-                            realdataframe,
-                            config.datasource.get_unique_hash(),
-                        )
-                        if schema is None:
-                            logger.warning(
-                                f"Could not get response type for {config.route}. Path does not exist:{realdataframe}"
-                            )
-                    except Exception as err:
-                        logger.warning(
-                            f"Could not get schema for {config.route}. Error:{err}",
-                            exc_info=err,
-                        )
-                        schema = None
-
-                metadata.append(
-                    {
-                        "name": config.name,
-                        "tag": config.tag,
-                        "route": config.route,
-                        "methods": methods,
-                        "file_type": config.datasource.file_type,
-                        "uri": config.datasource.uri,
-                        "version": config.version,
-                        "schema": {n: str(schema.field(n).type) for n in schema.names}
-                        if schema
-                        else None,
-                    }
-                )
-
-            except Exception as err:
-                import traceback
-
-                print(traceback.format_exc())
+            if time() - now_time > basic_config.max_route_init_time:
                 logger.warning(
-                    f"Could not get response type for f{config.route}. Error:{err}"
+                    f"Route initialization time exceeded {basic_config.max_route_init_time}s. Stopping further route initialization."
                 )
                 schema = None
+            else:
+                try:
+                    from bmsdna.lakeapi.core.datasource import Datasource
+
+                    assert config.datasource is not None
+                    with Datasource(
+                        config.version_str,
+                        config.tag,
+                        config.name,
+                        config=config.datasource,
+                        sql_context=mgr.get_context(config.engine),
+                        basic_config=basic_config,
+                        accounts=configs.accounts,
+                    ) as realdataframe:
+                        try:
+                            schema = get_schema_cached(
+                                basic_config,
+                                realdataframe,
+                                config.datasource.get_unique_hash(),
+                            )
+                            if schema is None:
+                                logger.warning(
+                                    f"Could not get response type for {config.route}. Path does not exist:{realdataframe}"
+                                )
+                        except Exception as err:
+                            logger.warning(
+                                f"Could not get schema for {config.route}. Error:{err}",
+                                exc_info=err,
+                            )
+                            schema = None
+
+                    metadata.append(
+                        {
+                            "name": config.name,
+                            "tag": config.tag,
+                            "route": config.route,
+                            "methods": methods,
+                            "file_type": config.datasource.file_type,
+                            "uri": config.datasource.uri,
+                            "version": config.version,
+                            "schema": {
+                                n: str(schema.field(n).type) for n in schema.names
+                            }
+                            if schema
+                            else None,
+                        }
+                    )
+
+                except Exception as err:
+                    import traceback
+
+                    print(traceback.format_exc())
+                    logger.warning(
+                        f"Could not get response type for f{config.route}. Error:{err}"
+                    )
+                    schema = None
 
             response_model = (
                 get_response_model(
diff --git a/bmsdna/lakeapi/utils/meta_cache.py b/bmsdna/lakeapi/utils/meta_cache.py
index e7ca90b..279deb9 100644
--- a/bmsdna/lakeapi/utils/meta_cache.py
+++ b/bmsdna/lakeapi/utils/meta_cache.py
@@ -8,8 +8,6 @@
     DeltaTableMeta,
     duckdb_apply_storage_options,
 )
-from deltalake2db.duckdb import apply_storage_options_fsspec
-from deltalake2db.azure_helper import get_account_name_from_path
 from typing import Optional
 
 
@@ -18,11 +16,15 @@
 _global_duck_con: Optional[duckdb.DuckDBPyConnection] = None
 _global_duck_meta_engine: Optional[DuckDBMetaEngine] = None
 
+DELTA_META_ENGINE = os.getenv("DELTA_META_ENGINE")
+
 
 def get_deltalake_meta(use_polars: bool, uri: SourceUri):
     global _global_duck_con
     global _global_duck_meta_engine
-    if use_polars:
+    if (
+        use_polars and not DELTA_META_ENGINE == "duckdb"
+    ) or DELTA_META_ENGINE == "polars":
         ab_uri, ab_opts = uri.get_uri_options(flavor="object_store")
 
         meta_engine = PolarsMetaEngine()
diff --git a/pyproject.toml b/pyproject.toml
index 8ee8277..72202c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "bmsdna-lakeapi"
-version = "0.27.3"
+version = "0.27.4"
 description = ""
 authors = [{ name = "DWH Team", email = "you@example.com" }]
 dependencies = [
diff --git a/tests/test_app.py b/tests/test_app.py
index 41f2d34..99e201c 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -1,3 +1,4 @@
+import os
 from fastapi.testclient import TestClient
 import sys
 import polars as pl
@@ -468,6 +469,8 @@ def test_fake_arrow(client: TestClient):
 
 
 def test_all_metadata(client: TestClient):
+    no_sql = os.getenv("NO_SQL_SERVER", "0") == "1"
+
     response = client.get("/metadata")
     assert response.status_code == 200
     jsd = response.json()
@@ -476,6 +479,8 @@
         name = item["name"]
         tag = item["tag"]
         route = item["route"]
+        if tag == "mssql" and no_sql:
+            continue
         meta_detail_route = route + f"/metadata_detail?%24engine={e}"
         print(meta_detail_route)
         response = client.get(meta_detail_route)
diff --git a/tests/test_mssql.py b/tests/test_mssql.py
index e4daf9b..c1e794b 100644
--- a/tests/test_mssql.py
+++ b/tests/test_mssql.py
@@ -1,6 +1,11 @@
 from fastapi.testclient import TestClient
+import pytest
+import os
 
+no_sql = os.getenv("NO_SQL_SERVER", "0") == "1"
+
 
+@pytest.mark.skipif(no_sql, reason="No SQL Server available")
 def test_simple_department(client: TestClient):
     response = client.get(
         "/api/v1/mssql/mssql_department?format=json&limit=50",
@@ -10,6 +15,7 @@ def test_simple_department(client: TestClient):
     assert len(departments) == 16
 
 
+@pytest.mark.skipif(no_sql, reason="No SQL Server available")
 def test_filter_group_name(client: TestClient):
     response = client.get(
         "/api/v1/mssql/mssql_department?format=json&limit=100&GroupName=Research%20and%20Development",
@@ -19,6 +25,7 @@ def test_filter_group_name(client: TestClient):
     assert len(tables) == 3
 
 
+@pytest.mark.skipif(no_sql, reason="No SQL Server available")
 def test_filter_offset(client: TestClient):
     response = client.get(
         "/api/v1/mssql/mssql_department?format=json&limit=100&offset=10",
@@ -28,6 +35,7 @@ def test_filter_offset(client: TestClient):
     assert len(tables) == 6
 
 
+@pytest.mark.skipif(no_sql, reason="No SQL Server available")
 def test_metadata_detail(client: TestClient):
     response = client.get(
         "/api/v1/mssql/mssql_department/metadata_detail",
diff --git a/uv.lock b/uv.lock
index 0fa6f84..bc577e1 100644
--- a/uv.lock
+++ b/uv.lock
@@ -368,7 +368,7 @@ wheels = [
 
 [[package]]
 name = "bmsdna-lakeapi"
-version = "0.27.2"
+version = "0.27.3"
 source = { editable = "." }
 dependencies = [
     { name = "adlfs" },