|
20 | 20 | FSSPEC_FILESYSTEMS = ("http", "https") |
21 | 21 | FSSPEC_BLOCK_SIZE = 32 * 1024 |
22 | 22 |
|
23 | | -# Filesystems for which calling .is_dir() may be very slow |
| 23 | +# Filesystems for which calling .is_dir() may be very slow and/or .iterdir() |
| 24 | +# may yield non-parquet paths. |
24 | 25 | NO_ITERDIR_FILESYSTEMS = ( |
25 | 26 | "http", |
26 | 27 | "https", |
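|
For illustration only, a minimal sketch of how a protocol check against NO_ITERDIR_FILESYSTEMS could gate directory probing. The helper name _supports_iterdir is hypothetical and not part of this diff, the tuple is shortened to the two entries visible in the hunk, and UPath's .protocol attribute is assumed to be available:

    from upath import UPath

    NO_ITERDIR_FILESYSTEMS = ("http", "https")  # shortened to the entries shown above

    def _supports_iterdir(path: UPath) -> bool:
        # Hypothetical guard: avoid .is_dir()/.iterdir() for protocols where
        # probing is slow (HTTP(S) may download content) or yields junk entries.
        return path.protocol not in NO_ITERDIR_FILESYSTEMS

    # e.g. _supports_iterdir(UPath("https://example.com/data.parquet")) -> False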
@@ -194,13 +195,17 @@ def _read_parquet_into_table( |
194 | 195 | # NOTE: the test for _is_local_dir is sufficient, because we're |
195 | 196 | # preserving a path to pq.read_table, which can read local |
196 | 197 | # directories, but not remote directories. Remote directories |
197 | | - # are supported separately via _read_parquet_directory |
| 198 | + # are supported separately via _read_parquet_directory. |
| 199 | +# We don't support HTTP(S) "directories": 1) calling .is_dir() |
| 200 | +# may be very expensive, because it downloads content first, |
| 201 | +# and 2) .iterdir() is likely to return a lot of "junk" |
| 202 | +# besides the actual parquet files. |
198 | 203 | if isinstance(data, str | Path | UPath) and not _is_local_dir(path_to_data := UPath(data)): |
199 | 204 | storage_options = _get_storage_options(path_to_data) |
200 | 205 | filesystem = kwargs.get("filesystem") |
201 | 206 | if not filesystem: |
202 | 207 | _, filesystem = _transform_read_parquet_data_arg(path_to_data) |
203 | | - # Check original string, because UPath may chomp trailing "/" |
| 208 | + # Will not detect HTTP(S) directories. |
204 | 209 | if _is_remote_dir(data, path_to_data): |
205 | 210 | return _read_remote_parquet_directory( |
206 | 211 | path_to_data, filesystem, storage_options, columns, **kwargs |
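|
To make the comment above concrete, here is a hedged sketch of the kind of check a helper like _is_remote_dir could perform. This is not the implementation from this diff; the two-argument signature is only inferred from the call sites shown, and how the real helper uses its first argument is not visible here:

    from pathlib import Path
    from upath import UPath

    def _is_remote_dir(original: str | Path | UPath, upath: UPath) -> bool:
        # Sketch only. HTTP(S) is ruled out up front so that .is_dir() is
        # never called where it would have to download content first.
        if upath.protocol in NO_ITERDIR_FILESYSTEMS:
            return False
        return upath.is_dir()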
@@ -255,14 +260,15 @@ def _read_remote_parquet_directory( |
255 | 260 | # handlers. This would work for e.g. S3, but not for HTTP(S). |
256 | 261 | if _is_remote_dir(upath, upath): |
257 | 262 | table = _read_remote_parquet_directory(upath, filesystem, storage_options, columns, **kwargs) |
258 | | - with fsspec.parquet.open_parquet_file( |
259 | | - upath.path, |
260 | | - columns=columns, |
261 | | - storage_options=storage_options, |
262 | | - fs=filesystem, |
263 | | - engine="pyarrow", |
264 | | - ) as parquet_file: |
265 | | - table = pq.read_table(parquet_file, columns=columns, **kwargs) |
| 263 | + else: |
| 264 | + with fsspec.parquet.open_parquet_file( |
| 265 | + upath.path, |
| 266 | + columns=columns, |
| 267 | + storage_options=storage_options, |
| 268 | + fs=filesystem, |
| 269 | + engine="pyarrow", |
| 270 | + ) as parquet_file: |
| 271 | + table = pq.read_table(parquet_file, columns=columns, **kwargs) |
266 | 272 | tables.append(table) |
267 | 273 | return pa.concat_tables(tables) |
268 | 274 |
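For reference, a self-contained usage sketch of the single-file branch above (fsspec.parquet.open_parquet_file feeding pyarrow.parquet.read_table), which fetches only the byte ranges needed for the requested columns. The URL and column names are placeholders:

    import fsspec.parquet
    import pyarrow.parquet as pq

    url = "https://example.com/data/part-0000.parquet"  # placeholder URL
    cols = ["id", "value"]  # placeholder column names

    with fsspec.parquet.open_parquet_file(
        url,
        columns=cols,
        engine="pyarrow",
    ) as f:
        # Read only the requested columns from the remote file.
        table = pq.read_table(f, columns=cols)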