From ae073164ce00e0dd9e24522de2af9fec442afd60 Mon Sep 17 00:00:00 2001 From: Shivam Date: Tue, 26 Aug 2025 20:55:49 -0700 Subject: [PATCH 1/3] Add support for DuckDB MAP functions - Add map_extract, map_keys, map_values, cardinality, element_at, map_concat, map_contains, map_contains_entry, map_contains_value, map_entries, map_extract_value, and map_from_entries functions - Create migration file pg_duckdb--1.0.0--1.1.0.sql with function definitions - Update metadata cache to recognize all MAP functions as DuckDB-only - Add comprehensive tests covering all functions with edge cases - Update documentation with detailed usage examples and function descriptions - Bump version to 1.1.0 in pg_duckdb.control - Add regression tests to schedule Closes #774 --- docs/functions.md | 120 +++++++++++++++ sql/pg_duckdb--1.0.0--1.1.0.sql | 82 ++++++++++ src/pgduckdb_metadata_cache.cpp | 14 +- test/regression/expected/map_functions.out | 66 +++++++++ .../expected/map_functions_comprehensive.out | 140 ++++++++++++++++++ test/regression/schedule | 2 + test/regression/sql/map_functions.sql | 20 +++ .../sql/map_functions_comprehensive.sql | 36 +++++ 8 files changed, 479 insertions(+), 1 deletion(-) create mode 100644 test/regression/expected/map_functions.out create mode 100644 test/regression/expected/map_functions_comprehensive.out create mode 100644 test/regression/sql/map_functions.sql create mode 100644 test/regression/sql/map_functions_comprehensive.sql diff --git a/docs/functions.md b/docs/functions.md index 7a92fcb8..e6376c73 100644 --- a/docs/functions.md +++ b/docs/functions.md @@ -27,6 +27,25 @@ All of the DuckDB [json functions and aggregates](https://duckdb.org/docs/data/j | [`union_extract`](#union_extract) | Extracts a value from a union type by tag name. | | [`union_tag`](#union_tag) | Gets the tag name of the active member in a union type. | +## MAP Functions + +All of the DuckDB [map functions](https://duckdb.org/docs/sql/data_types/map.html#map-functions). + +| Name | Description | +| :--- | :---------- | +| [`cardinality`](#cardinality) | Return the size of the map | +| [`element_at`](#element_at) | Return the value for a given key as a list | +| [`map_concat`](#map_concat) | Merge multiple maps | +| [`map_contains`](#map_contains) | Check if a map contains a given key | +| [`map_contains_entry`](#map_contains_entry) | Check if a map contains a given key-value pair | +| [`map_contains_value`](#map_contains_value) | Check if a map contains a given value | +| [`map_entries`](#map_entries) | Return a list of struct(k, v) for each key-value pair | +| [`map_extract`](#map_extract) | Extract a value from a map using a key | +| [`map_extract_value`](#map_extract_value) | Return the value for a given key or NULL | +| [`map_from_entries`](#map_from_entries) | Create a map from an array of struct(k, v) | +| [`map_keys`](#map_keys) | Get all keys from a map as a list | +| [`map_values`](#map_values) | Get all values from a map as a list | + ## Aggregates |Name|Description| @@ -664,6 +683,14 @@ FROM events; -- Filter using epoch time SELECT * FROM events WHERE epoch(created_at) > 1640995200; -- After 2022-01-01 + +#### `map_extract(map_col duckdb.map, key TEXT) -> duckdb.unresolved_type` + +Extracts a value from a map using the specified key. If the key doesn't exist, returns NULL. + +```sql +SELECT map_extract(MAP(['a', 'b'], [1, 2]), 'a'); -- Returns 1 +SELECT map_extract(MAP(['a', 'b'], [1, 2]), 'c'); -- Returns NULL ``` ##### Required Arguments @@ -686,6 +713,16 @@ SELECT epoch_ms(reading_time) AS timestamp_ms, value FROM sensor_readings; + +| map_col | duckdb.map | The map to extract from | +| key | text | The key to look up in the map | + +#### `map_keys(map_col duckdb.map) -> duckdb.unresolved_type` + +Returns all keys from a map as a list. + +```sql +SELECT map_keys(MAP(['a', 'b', 'c'], [1, 2, 3])); -- Returns ['a', 'b', 'c'] ``` ##### Required Arguments @@ -707,6 +744,15 @@ SELECT event_id, epoch_ms(timestamp_ms) AS event_time FROM events; + +| map_col | duckdb.map | The map to extract keys from | + +#### `map_values(map_col duckdb.map) -> duckdb.unresolved_type` + +Returns all values from a map as a list. + +```sql +SELECT map_values(MAP(['a', 'b', 'c'], [1, 2, 3])); -- Returns [1, 2, 3] ``` ##### Required Arguments @@ -904,3 +950,77 @@ FROM orders; | Name | Type | Description | | :--- | :--- | :---------- | | expression | any | The expression to count distinct values for | + +| map_col | duckdb.map | The map to extract values from | + +#### `cardinality(map_col duckdb.map) -> integer` + +Returns the size of the map. + +```sql +SELECT cardinality(MAP(['a', 'b', 'c'], [1, 2, 3])); -- Returns 3 +``` + +#### `element_at(map_col duckdb.map, key TEXT) -> duckdb.unresolved_type` + +Returns the value for a given key as a list. Alias for `map_extract`. + +```sql +SELECT element_at(MAP(['a', 'b'], [1, 2]), 'a'); -- Returns [1] +``` + +#### `map_concat(map_col duckdb.map, map_col2 duckdb.map) -> duckdb.unresolved_type` + +Merges multiple maps. On key collision, the value is taken from the last map. + +```sql +SELECT map_concat(MAP(['a', 'b'], [1, 2]), MAP(['b', 'c'], [3, 4])); -- Returns {'a': 1, 'b': 3, 'c': 4} +``` + +#### `map_contains(map_col duckdb.map, key TEXT) -> boolean` + +Checks if a map contains a given key. + +```sql +SELECT map_contains(MAP(['a', 'b'], [1, 2]), 'a'); -- Returns true +``` + +#### `map_contains_entry(map_col duckdb.map, key TEXT, value TEXT) -> boolean` + +Checks if a map contains a given key-value pair. + +```sql +SELECT map_contains_entry(MAP(['a', 'b'], [1, 2]), 'a', '1'); -- Returns true +``` + +#### `map_contains_value(map_col duckdb.map, value TEXT) -> boolean` + +Checks if a map contains a given value. + +```sql +SELECT map_contains_value(MAP(['a', 'b'], [1, 2]), '1'); -- Returns true +``` + +#### `map_entries(map_col duckdb.map) -> duckdb.unresolved_type` + +Returns a list of struct(k, v) for each key-value pair in the map. + +```sql +SELECT map_entries(MAP(['a', 'b'], [1, 2])); -- Returns [{'key': 'a', 'value': 1}, {'key': 'b', 'value': 2}] +``` + +#### `map_extract_value(map_col duckdb.map, key TEXT) -> duckdb.unresolved_type` + +Returns the value for a given key or NULL if the key is not contained in the map. + +```sql +SELECT map_extract_value(MAP(['a', 'b'], [1, 2]), 'a'); -- Returns 1 +``` + +#### `map_from_entries(entries duckdb.unresolved_type) -> duckdb.unresolved_type` + +Creates a map from an array of struct(k, v). + +```sql +SELECT map_from_entries([{'k': 'a', 'v': 1}, {'k': 'b', 'v': 2}]); -- Returns {'a': 1, 'b': 2} +``` diff --git a/sql/pg_duckdb--1.0.0--1.1.0.sql b/sql/pg_duckdb--1.0.0--1.1.0.sql index e69de29b..746ed2e1 100644 --- a/sql/pg_duckdb--1.0.0--1.1.0.sql +++ b/sql/pg_duckdb--1.0.0--1.1.0.sql @@ -0,0 +1,82 @@ +-- Add MAP functions support +-- Extract value from map using key +CREATE FUNCTION @extschema@.map_extract(map_col duckdb.map, key text) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract(map_col duckdb.unresolved_type, key text) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Get all keys from map +CREATE FUNCTION @extschema@.map_keys(map_col duckdb.map) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_keys(map_col duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Get all values from map +CREATE FUNCTION @extschema@.map_values(map_col duckdb.map) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_values(map_col duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Get map size +CREATE FUNCTION @extschema@.cardinality(map_col duckdb.map) +RETURNS integer AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.cardinality(map_col duckdb.unresolved_type) +RETURNS integer AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Get element at key (alias for map_extract) +CREATE FUNCTION @extschema@.element_at(map_col duckdb.map, key text) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.element_at(map_col duckdb.unresolved_type, key text) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.element_at(map_col duckdb.map, key integer) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.element_at(map_col duckdb.unresolved_type, key integer) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Merge multiple maps +CREATE FUNCTION @extschema@.map_concat(map_col duckdb.map, map_col2 duckdb.map) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_concat(map_col duckdb.unresolved_type, map_col2 duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Check if map contains key +CREATE FUNCTION @extschema@.map_contains(map_col duckdb.map, key text) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains(map_col duckdb.unresolved_type, key text) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains(map_col duckdb.map, key integer) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains(map_col duckdb.unresolved_type, key integer) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Check if map contains key-value pair +CREATE FUNCTION @extschema@.map_contains_entry(map_col duckdb.map, key text, value text) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains_entry(map_col duckdb.unresolved_type, key text, value text) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Check if map contains value +CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.map, value text) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.unresolved_type, value text) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Get all key-value pairs as structs +CREATE FUNCTION @extschema@.map_entries(map_col duckdb.map) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_entries(map_col duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Extract single value from map (not as list) +CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.map, key text) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.unresolved_type, key text) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.map, key integer) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.unresolved_type, key integer) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +-- Create map from array of struct(k, v) +CREATE FUNCTION @extschema@.map_from_entries(entries duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; diff --git a/src/pgduckdb_metadata_cache.cpp b/src/pgduckdb_metadata_cache.cpp index 0492b879..b11543da 100644 --- a/src/pgduckdb_metadata_cache.cpp +++ b/src/pgduckdb_metadata_cache.cpp @@ -183,7 +183,19 @@ BuildDuckdbOnlyFunctions() { "make_timestamptz", "time_bucket", "union_extract", - "union_tag"}; + "union_tag", + "cardinality", + "element_at", + "map_concat", + "map_contains", + "map_contains_entry", + "map_contains_value", + "map_entries", + "map_extract", + "map_extract_value", + "map_from_entries", + "map_keys", + "map_values"}; for (uint32_t i = 0; i < lengthof(function_names); i++) { CatCList *catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum(function_names[i])); diff --git a/test/regression/expected/map_functions.out b/test/regression/expected/map_functions.out new file mode 100644 index 00000000..e5f397cd --- /dev/null +++ b/test/regression/expected/map_functions.out @@ -0,0 +1,66 @@ +-- Test MAP functions +-- These tests verify that the MAP functions are properly exposed and working +-- Test map_extract function +SELECT map_extract(r['map_col'], 'a') as extracted_value FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; + extracted_value +----------------- + {1} +(1 row) + +SELECT map_extract(r['map_col'], 'd') as missing_key FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; + missing_key +------------- + {} +(1 row) + +SELECT map_extract(r['map_col'], 'key1') as string_value FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + string_value +-------------- + {value1} +(1 row) + +-- Test map_keys function +SELECT map_keys(r['map_col']) as keys FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; + keys +--------- + {a,b,c} +(1 row) + +SELECT map_keys(r['map_col']) as empty_keys FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; + empty_keys +------------ + {} +(1 row) + +-- Test map_values function +SELECT map_values(r['map_col']) as values FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; + values +--------- + {1,2,3} +(1 row) + +SELECT map_values(r['map_col']) as empty_values FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; + empty_values +-------------- + {} +(1 row) + +-- Test with unresolved_type for flexibility +SELECT map_extract(r['map_col']::duckdb.unresolved_type, 'x') as flexible_extract FROM duckdb.query($$ SELECT MAP(['x', 'y'], [10, 20]) as map_col $$) r; + flexible_extract +------------------ + {10} +(1 row) + +SELECT map_keys(r['map_col']::duckdb.unresolved_type) as flexible_keys FROM duckdb.query($$ SELECT MAP(['p', 'q'], [100, 200]) as map_col $$) r; + flexible_keys +--------------- + {p,q} +(1 row) + +SELECT map_values(r['map_col']::duckdb.unresolved_type) as flexible_values FROM duckdb.query($$ SELECT MAP(['m', 'n'], [1000, 2000]) as map_col $$) r; + flexible_values +----------------- + {1000,2000} +(1 row) + diff --git a/test/regression/expected/map_functions_comprehensive.out b/test/regression/expected/map_functions_comprehensive.out new file mode 100644 index 00000000..9dee1b83 --- /dev/null +++ b/test/regression/expected/map_functions_comprehensive.out @@ -0,0 +1,140 @@ +-- Test MAP functions +SET duckdb.force_execution = false; +-- Basic functions +SELECT cardinality(r['map_col']) as cardinality_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2', 'key3'], ['value1', 'value2', 'value3']) as map_col $$) r; + cardinality_result +-------------------- + 3 +(1 row) + +SELECT element_at(r['map_col'], 'key1') as element_at_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + element_at_result +------------------- + {value1} +(1 row) + +SELECT map_concat(r1['map1'], r2['map2']) as map_concat_result FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map1 $$) r1, duckdb.query($$ SELECT MAP(['b', 'c'], [3, 4]) as map2 $$) r2; + map_concat_result +------------------- + {a=1, b=3, c=4} +(1 row) + +-- Contains functions +SELECT map_contains(r['map_col'], 'key1') as contains_key_true FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + contains_key_true +------------------- + t +(1 row) + +SELECT map_contains(r['map_col'], 'key3') as contains_key_false FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + contains_key_false +-------------------- + f +(1 row) + +SELECT map_contains_entry(r['map_col'], 'key1', 'value1') as contains_entry_true FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + contains_entry_true +--------------------- + t +(1 row) + +SELECT map_contains_entry(r['map_col'], 'key1', 'value2') as contains_entry_false FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + contains_entry_false +---------------------- + f +(1 row) + +SELECT map_contains_value(r['map_col'], 'value1') as contains_value_true FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + contains_value_true +--------------------- + t +(1 row) + +SELECT map_contains_value(r['map_col'], 'value3') as contains_value_false FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + contains_value_false +---------------------- + f +(1 row) + +-- Extract functions +SELECT map_extract(r['map_col'], 'key1') as map_extract_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + map_extract_result +-------------------- + {value1} +(1 row) + +SELECT map_extract_value(r['map_col'], 'key1') as map_extract_value_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + map_extract_value_result +-------------------------- + value1 +(1 row) + +SELECT map_extract_value(r['map_col'], 'key3') as map_extract_value_null FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + map_extract_value_null +------------------------ + +(1 row) + +-- Keys and values +SELECT map_from_entries(r['entries']) as map_from_entries_result FROM duckdb.query($$ SELECT [{'k': 'a', 'v': 1}, {'k': 'b', 'v': 2}] as entries $$) r; + map_from_entries_result +------------------------- + {a=1, b=2} +(1 row) + +SELECT map_keys(r['map_col']) as map_keys_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2', 'key3'], ['value1', 'value2', 'value3']) as map_col $$) r; + map_keys_result +------------------ + {key1,key2,key3} +(1 row) + +SELECT map_values(r['map_col']) as map_values_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2', 'key3'], ['value1', 'value2', 'value3']) as map_col $$) r; + map_values_result +------------------------ + {value1,value2,value3} +(1 row) + +-- Different data types +SELECT cardinality(r['map_col']) as cardinality_numeric_keys FROM duckdb.query($$ SELECT MAP([1, 2, 3], ['a', 'b', 'c']) as map_col $$) r; + cardinality_numeric_keys +-------------------------- + 3 +(1 row) + +SELECT map_contains(r['map_col'], 1) as contains_numeric_key FROM duckdb.query($$ SELECT MAP([1, 2, 3], ['a', 'b', 'c']) as map_col $$) r; + contains_numeric_key +---------------------- + t +(1 row) + +-- Edge cases +SELECT cardinality(r['map_col']) as cardinality_empty FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; + cardinality_empty +------------------- + 0 +(1 row) + +SELECT map_keys(r['map_col']) as keys_empty FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; + keys_empty +------------ + {} +(1 row) + +SELECT map_values(r['map_col']) as values_empty FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; + values_empty +-------------- + {} +(1 row) + +SELECT map_extract(r['map_col'], 'non_existent') as extract_non_existent FROM duckdb.query($$ SELECT MAP(['key1'], ['value1']) as map_col $$) r; + extract_non_existent +---------------------- + {} +(1 row) + +SELECT map_contains(r['map_col'], 'non_existent') as contains_non_existent FROM duckdb.query($$ SELECT MAP(['key1'], ['value1']) as map_col $$) r; + contains_non_existent +----------------------- + f +(1 row) + diff --git a/test/regression/schedule b/test/regression/schedule index 7f646e48..32a120cd 100644 --- a/test/regression/schedule +++ b/test/regression/schedule @@ -52,6 +52,8 @@ test: transaction_isolation test: transactions test: type_support test: union_functions +test: map_functions +test: map_functions_comprehensive test: unresolved_type test: views test: parallel_postgres_scan diff --git a/test/regression/sql/map_functions.sql b/test/regression/sql/map_functions.sql new file mode 100644 index 00000000..f45e3b08 --- /dev/null +++ b/test/regression/sql/map_functions.sql @@ -0,0 +1,20 @@ +-- Test MAP functions +-- These tests verify that the MAP functions are properly exposed and working + +-- Test map_extract function +SELECT map_extract(r['map_col'], 'a') as extracted_value FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; +SELECT map_extract(r['map_col'], 'd') as missing_key FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; +SELECT map_extract(r['map_col'], 'key1') as string_value FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + +-- Test map_keys function +SELECT map_keys(r['map_col']) as keys FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; +SELECT map_keys(r['map_col']) as empty_keys FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; + +-- Test map_values function +SELECT map_values(r['map_col']) as values FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; +SELECT map_values(r['map_col']) as empty_values FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; + +-- Test with unresolved_type for flexibility +SELECT map_extract(r['map_col']::duckdb.unresolved_type, 'x') as flexible_extract FROM duckdb.query($$ SELECT MAP(['x', 'y'], [10, 20]) as map_col $$) r; +SELECT map_keys(r['map_col']::duckdb.unresolved_type) as flexible_keys FROM duckdb.query($$ SELECT MAP(['p', 'q'], [100, 200]) as map_col $$) r; +SELECT map_values(r['map_col']::duckdb.unresolved_type) as flexible_values FROM duckdb.query($$ SELECT MAP(['m', 'n'], [1000, 2000]) as map_col $$) r; diff --git a/test/regression/sql/map_functions_comprehensive.sql b/test/regression/sql/map_functions_comprehensive.sql new file mode 100644 index 00000000..a5a3b130 --- /dev/null +++ b/test/regression/sql/map_functions_comprehensive.sql @@ -0,0 +1,36 @@ +-- Test MAP functions +SET duckdb.force_execution = false; + +-- Basic functions +SELECT cardinality(r['map_col']) as cardinality_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2', 'key3'], ['value1', 'value2', 'value3']) as map_col $$) r; +SELECT element_at(r['map_col'], 'key1') as element_at_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; +SELECT map_concat(r1['map1'], r2['map2']) as map_concat_result FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map1 $$) r1, duckdb.query($$ SELECT MAP(['b', 'c'], [3, 4]) as map2 $$) r2; + +-- Contains functions +SELECT map_contains(r['map_col'], 'key1') as contains_key_true FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; +SELECT map_contains(r['map_col'], 'key3') as contains_key_false FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; +SELECT map_contains_entry(r['map_col'], 'key1', 'value1') as contains_entry_true FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; +SELECT map_contains_entry(r['map_col'], 'key1', 'value2') as contains_entry_false FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; +SELECT map_contains_value(r['map_col'], 'value1') as contains_value_true FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; +SELECT map_contains_value(r['map_col'], 'value3') as contains_value_false FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + +-- Extract functions +SELECT map_extract(r['map_col'], 'key1') as map_extract_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; +SELECT map_extract_value(r['map_col'], 'key1') as map_extract_value_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; +SELECT map_extract_value(r['map_col'], 'key3') as map_extract_value_null FROM duckdb.query($$ SELECT MAP(['key1', 'key2'], ['value1', 'value2']) as map_col $$) r; + +-- Keys and values +SELECT map_from_entries(r['entries']) as map_from_entries_result FROM duckdb.query($$ SELECT [{'k': 'a', 'v': 1}, {'k': 'b', 'v': 2}] as entries $$) r; +SELECT map_keys(r['map_col']) as map_keys_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2', 'key3'], ['value1', 'value2', 'value3']) as map_col $$) r; +SELECT map_values(r['map_col']) as map_values_result FROM duckdb.query($$ SELECT MAP(['key1', 'key2', 'key3'], ['value1', 'value2', 'value3']) as map_col $$) r; + +-- Different data types +SELECT cardinality(r['map_col']) as cardinality_numeric_keys FROM duckdb.query($$ SELECT MAP([1, 2, 3], ['a', 'b', 'c']) as map_col $$) r; +SELECT map_contains(r['map_col'], 1) as contains_numeric_key FROM duckdb.query($$ SELECT MAP([1, 2, 3], ['a', 'b', 'c']) as map_col $$) r; + +-- Edge cases +SELECT cardinality(r['map_col']) as cardinality_empty FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; +SELECT map_keys(r['map_col']) as keys_empty FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; +SELECT map_values(r['map_col']) as values_empty FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; +SELECT map_extract(r['map_col'], 'non_existent') as extract_non_existent FROM duckdb.query($$ SELECT MAP(['key1'], ['value1']) as map_col $$) r; +SELECT map_contains(r['map_col'], 'non_existent') as contains_non_existent FROM duckdb.query($$ SELECT MAP(['key1'], ['value1']) as map_col $$) r; From e3f6a7b5fd4790bb86e7262dbc66ef19e03f4fe2 Mon Sep 17 00:00:00 2001 From: Shivam Date: Wed, 8 Oct 2025 19:31:57 -0700 Subject: [PATCH 2/3] Fix MAP functions per review feedback Use duckdb.unresolved_type with explicit overloads for type flexibility, fix return types, update docs to use duckdb.query() syntax. Closes #774 --- docs/functions.md | 356 +++++++++++++------------------- sql/pg_duckdb--1.0.0--1.1.0.sql | 72 ++++++- 2 files changed, 208 insertions(+), 220 deletions(-) diff --git a/docs/functions.md b/docs/functions.md index e6376c73..8beae881 100644 --- a/docs/functions.md +++ b/docs/functions.md @@ -684,343 +684,273 @@ FROM events; SELECT * FROM events WHERE epoch(created_at) > 1640995200; -- After 2022-01-01 -#### `map_extract(map_col duckdb.map, key TEXT) -> duckdb.unresolved_type` +#### `map_extract(map_col duckdb.map, key duckdb.unresolved_type) -> duckdb.unresolved_type` -Extracts a value from a map using the specified key. If the key doesn't exist, returns NULL. +Extracts a value from a map using the specified key. If the key doesn't exist, returns an empty array. ```sql -SELECT map_extract(MAP(['a', 'b'], [1, 2]), 'a'); -- Returns 1 -SELECT map_extract(MAP(['a', 'b'], [1, 2]), 'c'); -- Returns NULL +-- Extract value from a map +SELECT map_extract(r['map_col'], 'a') as value +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: {1} + +-- Extract non-existent key +SELECT map_extract(r['map_col'], 'c') as value +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: {} ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| timestamp_expr | timestamp | The timestamp to convert to epoch seconds | - -#### `epoch_ms(timestamp_expr)` -> `BIGINT` - -Converts timestamps to Unix epoch milliseconds. - -```sql --- High-precision timestamp for JavaScript -SELECT epoch_ms(NOW()) AS timestamp_ms; - --- For time-series data -SELECT - sensor_id, - epoch_ms(reading_time) AS timestamp_ms, - value -FROM sensor_readings; - | map_col | duckdb.map | The map to extract from | -| key | text | The key to look up in the map | +| key | duckdb.unresolved_type | The key to look up in the map | #### `map_keys(map_col duckdb.map) -> duckdb.unresolved_type` -Returns all keys from a map as a list. +Returns all keys from a map as an array. ```sql -SELECT map_keys(MAP(['a', 'b', 'c'], [1, 2, 3])); -- Returns ['a', 'b', 'c'] +-- Get all keys from a map +SELECT map_keys(r['map_col']) as keys +FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; +-- Returns: {a,b,c} + +-- Empty map +SELECT map_keys(r['map_col']) as keys +FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; +-- Returns: {} ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| timestamp_expr | timestamp | The timestamp to convert to epoch milliseconds | - -#### `epoch_ms(milliseconds)` -> `TIMESTAMP` - -Converts Unix epoch milliseconds to a timestamp. This is the inverse of the above function. - -```sql --- Convert epoch milliseconds to timestamp -SELECT epoch_ms(1640995200000) AS timestamp_from_ms; -- 2022-01-01 00:00:00 - --- Convert stored milliseconds back to timestamps -SELECT - event_id, - epoch_ms(timestamp_ms) AS event_time -FROM events; - | map_col | duckdb.map | The map to extract keys from | #### `map_values(map_col duckdb.map) -> duckdb.unresolved_type` -Returns all values from a map as a list. +Returns all values from a map as an array. ```sql -SELECT map_values(MAP(['a', 'b', 'c'], [1, 2, 3])); -- Returns [1, 2, 3] +-- Get all values from a map +SELECT map_values(r['map_col']) as values +FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; +-- Returns: {1,2,3} + +-- Empty map +SELECT map_values(r['map_col']) as values +FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; +-- Returns: {} ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| milliseconds | bigint | Milliseconds since Unix epoch | +| map_col | duckdb.map | The map to extract values from | -#### `epoch_us(timestamp_expr)` -> `BIGINT` +#### `cardinality(map_col duckdb.map) -> numeric` -Converts timestamps to Unix epoch microseconds. +Returns the size of the map (number of key-value pairs). ```sql --- Microsecond precision timestamps -SELECT epoch_us(NOW()) AS timestamp_us; +-- Get the number of entries in a map +SELECT cardinality(r['map_col']) as size +FROM duckdb.query($$ SELECT MAP(['a', 'b', 'c'], [1, 2, 3]) as map_col $$) r; +-- Returns: 3 + +-- Empty map +SELECT cardinality(r['map_col']) as size +FROM duckdb.query($$ SELECT MAP([], []) as map_col $$) r; +-- Returns: 0 ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| timestamp_expr | timestamp | The timestamp to convert to epoch microseconds | +| map_col | duckdb.map | The map to get the size of | -#### `epoch_ns(timestamp_expr)` -> `BIGINT` +#### `element_at(map_col duckdb.map, key duckdb.unresolved_type) -> duckdb.unresolved_type` -Converts timestamps to Unix epoch nanoseconds. +Returns the value for a given key as an array. ```sql --- Nanosecond precision timestamps -SELECT epoch_ns(NOW()) AS timestamp_ns; +-- Get value for a specific key +SELECT element_at(r['map_col'], 'a') as value +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: {1} + +-- Non-existent key +SELECT element_at(r['map_col'], 'c') as value +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: {} ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| timestamp_expr | timestamp | The timestamp to convert to epoch nanoseconds | +| map_col | duckdb.map | The map to extract from | +| key | duckdb.unresolved_type | The key to look up in the map | -#### `make_timestamp(microseconds)` -> `TIMESTAMP` +#### `map_concat(map_col duckdb.map, map_col2 duckdb.map) -> duckdb.map` -Creates a timestamp from microseconds since Unix epoch (1970-01-01 00:00:00 UTC). +Merges multiple maps. On key collision, the value is taken from the last map. ```sql --- Create timestamp from current epoch microseconds -SELECT make_timestamp(epoch_us(NOW())) AS reconstructed_timestamp; +-- Merge two maps +SELECT map_concat(r1['map1'], r2['map2']) as merged +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map1 $$) r1, + duckdb.query($$ SELECT MAP(['b', 'c'], [3, 4]) as map2 $$) r2; +-- Returns: {a=1, b=3, c=4} --- Create specific timestamps -SELECT make_timestamp(1640995200000000) AS new_years_2022; -- 2022-01-01 00:00:00 +-- Note: 'b' value from map2 (3) overwrites map1's value (2) ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| microseconds | bigint | Microseconds since Unix epoch | +| map_col | duckdb.map | The first map | +| map_col2 | duckdb.map | The second map to merge | -#### `make_timestamptz(microseconds)` -> `TIMESTAMPTZ` +#### `map_contains(map_col duckdb.map, key duckdb.unresolved_type) -> boolean` -Creates a timestamp with timezone from microseconds since Unix epoch. +Checks if a map contains a given key. ```sql --- Create timestamptz from current epoch microseconds -SELECT make_timestamptz(epoch_us(NOW())) AS reconstructed_timestamptz; +-- Check if key exists +SELECT map_contains(r['map_col'], 'a') as has_key +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: t (true) --- Create specific timestamptz -SELECT make_timestamptz(1640995200000000) AS new_years_2022_tz; +-- Check for non-existent key +SELECT map_contains(r['map_col'], 'c') as has_key +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: f (false) ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| microseconds | bigint | Microseconds since Unix epoch | - -#### `TABLESAMPLE (sampling_method(percentage | rows))` - -Samples a subset of rows from a table or query result. This is useful for analyzing large datasets by working with representative samples, improving query performance for exploratory data analysis. - -```sql --- Sample 10% of rows from a table -SELECT * FROM large_table TABLESAMPLE SYSTEM(10); - --- Sample approximately 1000 rows -SELECT * FROM events TABLESAMPLE SYSTEM(1000 ROWS); +| map_col | duckdb.map | The map to check | +| key | duckdb.unresolved_type | The key to search for | --- Sample from data lake files -SELECT * FROM read_parquet('s3://datalake/**/*.parquet') TABLESAMPLE SYSTEM(5); +#### `map_contains_entry(map_col duckdb.map, key duckdb.unresolved_type, value duckdb.unresolved_type) -> boolean` --- Use sampling for quick data profiling -SELECT - region, - COUNT(*) as sample_count, - AVG(revenue) as avg_revenue -FROM sales_data TABLESAMPLE SYSTEM(2) -GROUP BY region; - --- Sample from joins for performance -SELECT c.name, COUNT(o.id) as order_count -FROM customers c -JOIN orders o TABLESAMPLE SYSTEM(10) ON c.id = o.customer_id -GROUP BY c.name; -``` - -**Sampling Methods:** - -- **SYSTEM**: Random sampling at the storage level (faster, approximate percentage) -- **BERNOULLI**: Row-by-row random sampling (slower, exact percentage) +Checks if a map contains a given key-value pair. ```sql --- System sampling (recommended for large tables) -SELECT * FROM huge_table TABLESAMPLE SYSTEM(1); +-- Check if key-value pair exists +SELECT map_contains_entry(r['map_col'], 'a', 1) as has_entry +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: t (true) --- Bernoulli sampling (exact percentage) -SELECT * FROM medium_table TABLESAMPLE BERNOULLI(5); +-- Check with wrong value for existing key +SELECT map_contains_entry(r['map_col'], 'a', 2) as has_entry +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: f (false) ``` -**Use Cases:** - -- **Data exploration**: Quick analysis of large datasets -- **Performance testing**: Test queries on sample data -- **Data profiling**: Understand data distribution patterns -- **ETL development**: Develop pipelines on sample data -- **Quality checks**: Validate data quality on samples - -Further information: -* [DuckDB TABLESAMPLE documentation](https://duckdb.org/docs/sql/query_syntax/sample.html) - ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| sampling_method | keyword | Either `SYSTEM` or `BERNOULLI` | -| percentage | numeric | Percentage of rows to sample (0-100) | - -##### Optional Arguments - -| Name | Type | Description | -| :--- | :--- | :---------- | -| rows | integer | Approximate number of rows to sample (use with `ROWS` keyword) | +| map_col | duckdb.map | The map to check | +| key | duckdb.unresolved_type | The key to search for | +| value | duckdb.unresolved_type | The value to match with the key | -#### `union_extract(union_col, tag)` -> `duckdb.unresolved_type` +#### `map_contains_value(map_col duckdb.map, value duckdb.unresolved_type) -> boolean` -Extracts a value from a union type by specifying the tag name of the member you want to access. +Checks if a map contains a given value. ```sql --- Extract the string value if the union contains a string -SELECT union_extract(my_union_column, 'string') FROM my_table; +-- Check if value exists +SELECT map_contains_value(r['map_col'], 1) as has_value +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: t (true) --- Extract integer value from union -SELECT union_extract(data_field, 'integer') AS extracted_int FROM mixed_data; +-- Check for non-existent value +SELECT map_contains_value(r['map_col'], 3) as has_value +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: f (false) ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| union_col | duckdb.union or duckdb.unresolved_type | The union column to extract from | -| tag | text | The tag name of the union member to extract | +| map_col | duckdb.map | The map to check | +| value | duckdb.unresolved_type | The value to search for | -#### `union_tag(union_col)` -> `duckdb.unresolved_type` +#### `map_entries(map_col duckdb.map) -> duckdb.struct[]` -Returns the tag name of the currently active member in a union type. +Returns an array of struct(key, value) for each key-value pair in the map. ```sql --- Get the active tag for each row -SELECT union_tag(my_union_column) AS active_type FROM my_table; +-- Get all key-value pairs as structs +SELECT map_entries(r['map_col']) as entries +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: {"(a,1)","(b,2)"} --- Filter rows based on union tag -SELECT * FROM my_table WHERE union_tag(data_field) = 'string'; +-- Access individual struct fields +SELECT unnest(map_entries(r['map_col'])) as entry +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| union_col | duckdb.union or duckdb.unresolved_type | The union column to get the tag from | +| map_col | duckdb.map | The map to extract entries from | -#### `approx_count_distinct(expression)` -> `BIGINT` +#### `map_extract_value(map_col duckdb.map, key duckdb.unresolved_type) -> duckdb.unresolved_type` -Approximates the count of distinct elements using the HyperLogLog algorithm. This is much faster than `COUNT(DISTINCT ...)` for large datasets, with a small error rate. +Returns the value for a given key or NULL if the key is not contained in the map. ```sql --- Approximate distinct count of customer IDs -SELECT approx_count_distinct(customer_id) FROM orders; +-- Extract single value (not as array) +SELECT map_extract_value(r['map_col'], 'a') as value +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: 1 --- Compare with exact count -SELECT - approx_count_distinct(customer_id) AS approx_distinct, - COUNT(DISTINCT customer_id) AS exact_distinct -FROM orders; +-- Non-existent key returns NULL +SELECT map_extract_value(r['map_col'], 'c') as value +FROM duckdb.query($$ SELECT MAP(['a', 'b'], [1, 2]) as map_col $$) r; +-- Returns: NULL ``` ##### Required Arguments | Name | Type | Description | | :--- | :--- | :---------- | -| expression | any | The expression to count distinct values for | - -| map_col | duckdb.map | The map to extract values from | - -#### `cardinality(map_col duckdb.map) -> integer` - -Returns the size of the map. - -```sql -SELECT cardinality(MAP(['a', 'b', 'c'], [1, 2, 3])); -- Returns 3 -``` - -#### `element_at(map_col duckdb.map, key TEXT) -> duckdb.unresolved_type` - -Returns the value for a given key as a list. Alias for `map_extract`. - -```sql -SELECT element_at(MAP(['a', 'b'], [1, 2]), 'a'); -- Returns [1] -``` - -#### `map_concat(map_col duckdb.map, map_col2 duckdb.map) -> duckdb.unresolved_type` - -Merges multiple maps. On key collision, the value is taken from the last map. - -```sql -SELECT map_concat(MAP(['a', 'b'], [1, 2]), MAP(['b', 'c'], [3, 4])); -- Returns {'a': 1, 'b': 3, 'c': 4} -``` - -#### `map_contains(map_col duckdb.map, key TEXT) -> boolean` - -Checks if a map contains a given key. - -```sql -SELECT map_contains(MAP(['a', 'b'], [1, 2]), 'a'); -- Returns true -``` - -#### `map_contains_entry(map_col duckdb.map, key TEXT, value TEXT) -> boolean` - -Checks if a map contains a given key-value pair. - -```sql -SELECT map_contains_entry(MAP(['a', 'b'], [1, 2]), 'a', '1'); -- Returns true -``` - -#### `map_contains_value(map_col duckdb.map, value TEXT) -> boolean` - -Checks if a map contains a given value. - -```sql -SELECT map_contains_value(MAP(['a', 'b'], [1, 2]), '1'); -- Returns true -``` +| map_col | duckdb.map | The map to extract from | +| key | duckdb.unresolved_type | The key to look up in the map | -#### `map_entries(map_col duckdb.map) -> duckdb.unresolved_type` +#### `map_from_entries(entries duckdb.struct[]) -> duckdb.map` -Returns a list of struct(k, v) for each key-value pair in the map. +Creates a map from an array of struct(k, v). ```sql -SELECT map_entries(MAP(['a', 'b'], [1, 2])); -- Returns [{'key': 'a', 'value': 1}, {'key': 'b', 'value': 2}] -``` - -#### `map_extract_value(map_col duckdb.map, key TEXT) -> duckdb.unresolved_type` +-- Create map from array of structs +SELECT map_from_entries(r['entries']) as new_map +FROM duckdb.query($$ + SELECT [{'k': 'a', 'v': 1}, {'k': 'b', 'v': 2}] as entries +$$) r; +-- Returns: {a=1, b=2} -Returns the value for a given key or NULL if the key is not contained in the map. - -```sql -SELECT map_extract_value(MAP(['a', 'b'], [1, 2]), 'a'); -- Returns 1 +-- This is the inverse operation of map_entries +SELECT map_from_entries(map_entries(r['map_col'])) as reconstructed +FROM duckdb.query($$ SELECT MAP(['x', 'y'], [10, 20]) as map_col $$) r; +-- Returns: {x=10, y=20} ``` -#### `map_from_entries(entries duckdb.unresolved_type) -> duckdb.unresolved_type` - -Creates a map from an array of struct(k, v). +##### Required Arguments -```sql -SELECT map_from_entries([{'k': 'a', 'v': 1}, {'k': 'b', 'v': 2}]); -- Returns {'a': 1, 'b': 2} -``` +| Name | Type | Description | +| :--- | :--- | :---------- | +| entries | duckdb.struct[] | Array of structs with 'k' (key) and 'v' (value) fields | diff --git a/sql/pg_duckdb--1.0.0--1.1.0.sql b/sql/pg_duckdb--1.0.0--1.1.0.sql index 746ed2e1..5db78578 100644 --- a/sql/pg_duckdb--1.0.0--1.1.0.sql +++ b/sql/pg_duckdb--1.0.0--1.1.0.sql @@ -1,9 +1,21 @@ -- Add MAP functions support -- Extract value from map using key +CREATE FUNCTION @extschema@.map_extract(map_col duckdb.map, key duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract(map_col duckdb.unresolved_type, key duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_extract(map_col duckdb.map, key text) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_extract(map_col duckdb.unresolved_type, key text) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract(map_col duckdb.map, key integer) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract(map_col duckdb.unresolved_type, key integer) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract(map_col duckdb.map, key bigint) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract(map_col duckdb.unresolved_type, key bigint) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Get all keys from map CREATE FUNCTION @extschema@.map_keys(map_col duckdb.map) @@ -19,11 +31,15 @@ RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANG -- Get map size CREATE FUNCTION @extschema@.cardinality(map_col duckdb.map) -RETURNS integer AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +RETURNS numeric AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.cardinality(map_col duckdb.unresolved_type) -RETURNS integer AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +RETURNS numeric AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Get element at key (alias for map_extract) +CREATE FUNCTION @extschema@.element_at(map_col duckdb.map, key duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.element_at(map_col duckdb.unresolved_type, key duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.element_at(map_col duckdb.map, key text) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.element_at(map_col duckdb.unresolved_type, key text) @@ -32,14 +48,26 @@ CREATE FUNCTION @extschema@.element_at(map_col duckdb.map, key integer) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.element_at(map_col duckdb.unresolved_type, key integer) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.element_at(map_col duckdb.map, key bigint) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.element_at(map_col duckdb.unresolved_type, key bigint) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Merge multiple maps CREATE FUNCTION @extschema@.map_concat(map_col duckdb.map, map_col2 duckdb.map) -RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +RETURNS duckdb.map AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_concat(map_col duckdb.unresolved_type, map_col2 duckdb.map) +RETURNS duckdb.map AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_concat(map_col duckdb.map, map_col2 duckdb.unresolved_type) +RETURNS duckdb.map AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_concat(map_col duckdb.unresolved_type, map_col2 duckdb.unresolved_type) -RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +RETURNS duckdb.map AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Check if map contains key +CREATE FUNCTION @extschema@.map_contains(map_col duckdb.map, key duckdb.unresolved_type) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains(map_col duckdb.unresolved_type, key duckdb.unresolved_type) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_contains(map_col duckdb.map, key text) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_contains(map_col duckdb.unresolved_type, key text) @@ -48,26 +76,50 @@ CREATE FUNCTION @extschema@.map_contains(map_col duckdb.map, key integer) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_contains(map_col duckdb.unresolved_type, key integer) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains(map_col duckdb.map, key bigint) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains(map_col duckdb.unresolved_type, key bigint) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Check if map contains key-value pair +CREATE FUNCTION @extschema@.map_contains_entry(map_col duckdb.map, key duckdb.unresolved_type, value duckdb.unresolved_type) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains_entry(map_col duckdb.unresolved_type, key duckdb.unresolved_type, value duckdb.unresolved_type) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_contains_entry(map_col duckdb.map, key text, value text) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_contains_entry(map_col duckdb.unresolved_type, key text, value text) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Check if map contains value +CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.map, value duckdb.unresolved_type) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.unresolved_type, value duckdb.unresolved_type) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.map, value text) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.unresolved_type, value text) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.map, value integer) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.unresolved_type, value integer) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.map, value bigint) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_contains_value(map_col duckdb.unresolved_type, value bigint) +RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Get all key-value pairs as structs CREATE FUNCTION @extschema@.map_entries(map_col duckdb.map) -RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +RETURNS duckdb.struct[] AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_entries(map_col duckdb.unresolved_type) -RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +RETURNS duckdb.struct[] AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Extract single value from map (not as list) +CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.map, key duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.unresolved_type, key duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.map, key text) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.unresolved_type, key text) @@ -76,7 +128,13 @@ CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.map, key integer) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.unresolved_type, key integer) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.map, key bigint) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +CREATE FUNCTION @extschema@.map_extract_value(map_col duckdb.unresolved_type, key bigint) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; -- Create map from array of struct(k, v) +CREATE FUNCTION @extschema@.map_from_entries(entries duckdb.struct[]) +RETURNS duckdb.map AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; CREATE FUNCTION @extschema@.map_from_entries(entries duckdb.unresolved_type) -RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; +RETURNS duckdb.map AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; From 281ff4437a18b1ec501afe8ae2520daf140cdf93 Mon Sep 17 00:00:00 2001 From: Shivam Date: Wed, 8 Oct 2025 22:44:06 -0700 Subject: [PATCH 3/3] Resolved merge conflicts Closes #774 --- docs/functions.md | 240 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) diff --git a/docs/functions.md b/docs/functions.md index 8beae881..aa8d728b 100644 --- a/docs/functions.md +++ b/docs/functions.md @@ -683,6 +683,13 @@ FROM events; -- Filter using epoch time SELECT * FROM events WHERE epoch(created_at) > 1640995200; -- After 2022-01-01 +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| timestamp_expr | timestamp | The timestamp to convert to epoch seconds | #### `map_extract(map_col duckdb.map, key duckdb.unresolved_type) -> duckdb.unresolved_type` @@ -954,3 +961,236 @@ FROM duckdb.query($$ SELECT MAP(['x', 'y'], [10, 20]) as map_col $$) r; | Name | Type | Description | | :--- | :--- | :---------- | | entries | duckdb.struct[] | Array of structs with 'k' (key) and 'v' (value) fields | + +#### `epoch_ms(timestamp_expr)` -> `BIGINT` + +Converts timestamps to Unix epoch milliseconds. + +```sql +-- High-precision timestamp for JavaScript +SELECT epoch_ms(NOW()) AS timestamp_ms; + +-- For time-series data +SELECT + sensor_id, + epoch_ms(reading_time) AS timestamp_ms, + value +FROM sensor_readings; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| timestamp_expr | timestamp | The timestamp to convert to epoch milliseconds | + +#### `epoch_ms(milliseconds)` -> `TIMESTAMP` + +Converts Unix epoch milliseconds to a timestamp. This is the inverse of the above function. + +```sql +-- Convert epoch milliseconds to timestamp +SELECT epoch_ms(1640995200000) AS timestamp_from_ms; -- 2022-01-01 00:00:00 + +-- Convert stored milliseconds back to timestamps +SELECT + event_id, + epoch_ms(timestamp_ms) AS event_time +FROM events; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| milliseconds | bigint | Milliseconds since Unix epoch | + +#### `epoch_us(timestamp_expr)` -> `BIGINT` + +Converts timestamps to Unix epoch microseconds. + +```sql +-- Microsecond precision timestamps +SELECT epoch_us(NOW()) AS timestamp_us; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| timestamp_expr | timestamp | The timestamp to convert to epoch microseconds | + +#### `epoch_ns(timestamp_expr)` -> `BIGINT` + +Converts timestamps to Unix epoch nanoseconds. + +```sql +-- Nanosecond precision timestamps +SELECT epoch_ns(NOW()) AS timestamp_ns; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| timestamp_expr | timestamp | The timestamp to convert to epoch nanoseconds | + +#### `make_timestamp(microseconds)` -> `TIMESTAMP` + +Creates a timestamp from microseconds since Unix epoch (1970-01-01 00:00:00 UTC). + +```sql +-- Create timestamp from current epoch microseconds +SELECT make_timestamp(epoch_us(NOW())) AS reconstructed_timestamp; + +-- Create specific timestamps +SELECT make_timestamp(1640995200000000) AS new_years_2022; -- 2022-01-01 00:00:00 +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| microseconds | bigint | Microseconds since Unix epoch | + +#### `make_timestamptz(microseconds)` -> `TIMESTAMPTZ` + +Creates a timestamp with timezone from microseconds since Unix epoch. + +```sql +-- Create timestamptz from current epoch microseconds +SELECT make_timestamptz(epoch_us(NOW())) AS reconstructed_timestamptz; + +-- Create specific timestamptz +SELECT make_timestamptz(1640995200000000) AS new_years_2022_tz; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| microseconds | bigint | Microseconds since Unix epoch | + +#### `TABLESAMPLE (sampling_method(percentage | rows))` + +Samples a subset of rows from a table or query result. This is useful for analyzing large datasets by working with representative samples, improving query performance for exploratory data analysis. + +```sql +-- Sample 10% of rows from a table +SELECT * FROM large_table TABLESAMPLE SYSTEM(10); + +-- Sample approximately 1000 rows +SELECT * FROM events TABLESAMPLE SYSTEM(1000 ROWS); + +-- Sample from data lake files +SELECT * FROM read_parquet('s3://datalake/**/*.parquet') TABLESAMPLE SYSTEM(5); + +-- Use sampling for quick data profiling +SELECT + region, + COUNT(*) as sample_count, + AVG(revenue) as avg_revenue +FROM sales_data TABLESAMPLE SYSTEM(2) +GROUP BY region; + +-- Sample from joins for performance +SELECT c.name, COUNT(o.id) as order_count +FROM customers c +JOIN orders o TABLESAMPLE SYSTEM(10) ON c.id = o.customer_id +GROUP BY c.name; +``` + +**Sampling Methods:** + +- **SYSTEM**: Random sampling at the storage level (faster, approximate percentage) +- **BERNOULLI**: Row-by-row random sampling (slower, exact percentage) + +```sql +-- System sampling (recommended for large tables) +SELECT * FROM huge_table TABLESAMPLE SYSTEM(1); + +-- Bernoulli sampling (exact percentage) +SELECT * FROM medium_table TABLESAMPLE BERNOULLI(5); +``` + +**Use Cases:** + +- **Data exploration**: Quick analysis of large datasets +- **Performance testing**: Test queries on sample data +- **Data profiling**: Understand data distribution patterns +- **ETL development**: Develop pipelines on sample data +- **Quality checks**: Validate data quality on samples + +Further information: +* [DuckDB TABLESAMPLE documentation](https://duckdb.org/docs/sql/query_syntax/sample.html) + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| sampling_method | keyword | Either `SYSTEM` or `BERNOULLI` | +| percentage | numeric | Percentage of rows to sample (0-100) | + +##### Optional Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| rows | integer | Approximate number of rows to sample (use with `ROWS` keyword) | + +#### `union_extract(union_col, tag)` -> `duckdb.unresolved_type` + +Extracts a value from a union type by specifying the tag name of the member you want to access. + +```sql +-- Extract the string value if the union contains a string +SELECT union_extract(my_union_column, 'string') FROM my_table; + +-- Extract integer value from union +SELECT union_extract(data_field, 'integer') AS extracted_int FROM mixed_data; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| union_col | duckdb.union or duckdb.unresolved_type | The union column to extract from | +| tag | text | The tag name of the union member to extract | + +#### `union_tag(union_col)` -> `duckdb.unresolved_type` + +Returns the tag name of the currently active member in a union type. + +```sql +-- Get the active tag for each row +SELECT union_tag(my_union_column) AS active_type FROM my_table; + +-- Filter rows based on union tag +SELECT * FROM my_table WHERE union_tag(data_field) = 'string'; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| union_col | duckdb.union or duckdb.unresolved_type | The union column to get the tag from | + +#### `approx_count_distinct(expression)` -> `BIGINT` + +Approximates the count of distinct elements using the HyperLogLog algorithm. This is much faster than `COUNT(DISTINCT ...)` for large datasets, with a small error rate. + +```sql +-- Approximate distinct count of customer IDs +SELECT approx_count_distinct(customer_id) FROM orders; + +-- Compare with exact count +SELECT + approx_count_distinct(customer_id) AS approx_distinct, + COUNT(DISTINCT customer_id) AS exact_distinct +FROM orders; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| expression | any | The expression to count distinct values for |