Skip to content

Commit 90c7527

Browse files
committed
Fix a bug when reading directory twice
1 parent 4cf1314 commit 90c7527

File tree

1 file changed

+17
-11
lines changed
  • src/nested_pandas/nestedframe

1 file changed

+17
-11
lines changed

src/nested_pandas/nestedframe/io.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
FSSPEC_FILESYSTEMS = ("http", "https")
2121
FSSPEC_BLOCK_SIZE = 32 * 1024
2222

23-
# Filesystems for which calling .is_dir() may be very slow
23+
# Filesystems for which calling .is_dir() may be very slow and/or .iterdir()
24+
# may yield non-parquet paths.
2425
NO_ITERDIR_FILESYSTEMS = (
2526
"http",
2627
"https",
@@ -194,13 +195,17 @@ def _read_parquet_into_table(
194195
# NOTE: the test for _is_local_dir is sufficient, because we're
195196
# preserving a path to pq.read_table, which can read local
196197
# directories, but not remote directories. Remote directories
197-
# are supported separately via _read_parquet_directory
198+
# are supported separately via _read_parquet_directory.
199+
# We don't support HTTP "directories", because 1) calling .is_dir()
200+
# may be very expensive, because it downloads content first,
201+
# 2) .iterdir() is likely to return a lot of "junk"
202+
# besides the actual parquet files.
198203
if isinstance(data, str | Path | UPath) and not _is_local_dir(path_to_data := UPath(data)):
199204
storage_options = _get_storage_options(path_to_data)
200205
filesystem = kwargs.get("filesystem")
201206
if not filesystem:
202207
_, filesystem = _transform_read_parquet_data_arg(path_to_data)
203-
# Check original string, because UPath may chomp trailing "/"
208+
# Will not detect HTTP(S) directories.
204209
if _is_remote_dir(data, path_to_data):
205210
return _read_remote_parquet_directory(
206211
path_to_data, filesystem, storage_options, columns, **kwargs
@@ -255,14 +260,15 @@ def _read_remote_parquet_directory(
255260
# handlers. This would work for e.g. S3, but not for HTTP(S).
256261
if _is_remote_dir(upath, upath):
257262
table = _read_remote_parquet_directory(upath, filesystem, storage_options, columns, **kwargs)
258-
with fsspec.parquet.open_parquet_file(
259-
upath.path,
260-
columns=columns,
261-
storage_options=storage_options,
262-
fs=filesystem,
263-
engine="pyarrow",
264-
) as parquet_file:
265-
table = pq.read_table(parquet_file, columns=columns, **kwargs)
263+
else:
264+
with fsspec.parquet.open_parquet_file(
265+
upath.path,
266+
columns=columns,
267+
storage_options=storage_options,
268+
fs=filesystem,
269+
engine="pyarrow",
270+
) as parquet_file:
271+
table = pq.read_table(parquet_file, columns=columns, **kwargs)
266272
tables.append(table)
267273
return pa.concat_tables(tables)
268274

0 commit comments

Comments
 (0)