Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ log = "^0.4"
num-traits = { version = "0.2" }
object_store = { version = "0.12.4", default-features = false }
parking_lot = "0.12"
percent-encoding = "2.3"
parquet = { version = "57.1.0", default-features = false, features = [
"arrow",
"async",
Expand Down
1 change: 1 addition & 0 deletions datafusion/catalog-listing/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ futures = { workspace = true }
itertools = { workspace = true }
log = { workspace = true }
object_store = { workspace = true }
percent-encoding = { workspace = true }

[dev-dependencies]
datafusion-datasource-parquet = { workspace = true }
Expand Down
73 changes: 65 additions & 8 deletions datafusion/catalog-listing/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ use datafusion_expr::{Expr, Volatility};
use datafusion_physical_expr::create_physical_expr;
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};
use percent_encoding::percent_decode_str;

/// Check whether the given expression can be resolved using only the columns `col_names`.
/// This means that if this function returns true:
Expand Down Expand Up @@ -417,12 +418,15 @@ pub async fn pruned_partition_list<'a>(
}

/// Extract the partition values for the given `file_path` (in the given `table_path`)
/// associated to the partitions defined by `table_partition_cols`
/// associated to the partitions defined by `table_partition_cols`.
///
/// Partition values are URL-decoded, since object stores like S3 encode special
/// characters (e.g., `/` becomes `%2F`) in path segments.
pub fn parse_partitions_for_path<'a, I>(
table_path: &ListingTableUrl,
file_path: &'a Path,
table_partition_cols: I,
) -> Option<Vec<&'a str>>
) -> Option<Vec<String>>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Not sure whether it is worth returning a `Vec<Cow<'a, str>>` here. E.g., if the partition value contains `%`, decode it and return `Cow::Owned(decoded.into_owned())`; otherwise return `Cow::Borrowed(val)`.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where
I: IntoIterator<Item = &'a str>,
{
Expand All @@ -431,7 +435,10 @@ where
let mut part_values = vec![];
for (part, expected_partition) in subpath.zip(table_partition_cols) {
match part.split_once('=') {
Some((name, val)) if name == expected_partition => part_values.push(val),
Some((name, val)) if name == expected_partition => {
let decoded = percent_decode_str(val).decode_utf8().ok()?;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should invalid values (like `%FF` in one of the tests below) return `None` or `val`?
I think it should behave like `%XX`, i.e. return `val`.

part_values.push(decoded.into_owned());
}
_ => {
debug!(
"Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'",
Expand Down Expand Up @@ -507,7 +514,7 @@ mod tests {
#[test]
fn test_parse_partitions_for_path() {
assert_eq!(
Some(vec![]),
Some(vec![] as Vec<String>),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::from("bucket/mytable/file.csv"),
Expand All @@ -531,15 +538,25 @@ mod tests {
)
);
assert_eq!(
Some(vec!["v1"]),
Some(vec!["v1".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
vec!["mypartition"]
)
);
// URL-encoded partition values should be decoded
// Use Path::parse to avoid double-encoding (Path::from encodes % as %25)
assert_eq!(
Some(vec!["v1"]),
Some(vec!["v/1".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/mypartition=v%2F1/file.csv").unwrap(),
vec!["mypartition"]
)
);
assert_eq!(
Some(vec!["v1".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable/").unwrap(),
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
Expand All @@ -556,21 +573,61 @@ mod tests {
)
);
assert_eq!(
Some(vec!["v1", "v2"]),
Some(vec!["v1".to_string(), "v2".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
vec!["mypartition", "otherpartition"]
)
);
assert_eq!(
Some(vec!["v1"]),
Some(vec!["v1".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
vec!["mypartition"]
)
);
assert_eq!(
Some(vec!["John Doe".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/name=John%20Doe/file.csv").unwrap(),
vec!["name"]
)
);
assert_eq!(
Some(vec!["a/b".to_string(), "c d".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/p1=a%2Fb/p2=c%20d/file.csv").unwrap(),
vec!["p1", "p2"]
)
);
assert_eq!(
Some(vec!["Müller".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/name=M%C3%BCller/file.csv").unwrap(),
vec!["name"]
)
);
assert_eq!(
Some(vec!["invalid%XX".to_string()]),
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/p1=invalid%XX/file.csv").unwrap(),
vec!["p1"]
)
);
assert_eq!(
None,
parse_partitions_for_path(
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
&Path::parse("bucket/mytable/p1=%FF/file.csv").unwrap(),
vec!["p1"]
)
);
}

#[test]
Expand Down
Loading