@@ -57,6 +57,11 @@ def read_parquet(
5757
5858 Notes
5959 -----
60+ For remote storage (S3, GCS, HTTP/HTTPS), this function automatically uses
61+ fsspec.parquet.open_parquet_file for optimized access with intelligent
62+ precaching, which can significantly improve performance compared to standard
63+ PyArrow reading.
64+
6065 pyarrow supports partial loading of nested structures from parquet, for
6166 example ```pd.read_parquet("data.parquet", columns=["nested.a"])``` will
6267 load the "a" column of the "nested" column. Standard pandas/pyarrow
@@ -94,8 +99,11 @@ def read_parquet(
9499 reject_nesting = [reject_nesting ]
95100
96101 # First load through pyarrow
102+ # If data is remote, use fsspec.parquet for better performance
103+ if _should_use_fsspec_optimization (data , kwargs .get ("filesystem" )):
104+ table = _read_with_fsspec_optimization (data , columns , kwargs )
97105 # If `filesystem` is specified - use it
98- if kwargs .get ("filesystem" ) is not None :
106+ elif kwargs .get ("filesystem" ) is not None :
99107 table = pq .read_table (data , columns = columns , ** kwargs )
100108 # Otherwise convert with a special function
101109 else :
@@ -291,3 +299,88 @@ def _cast_list_cols_to_nested(df):
291299 if pa .types .is_list (dtype .pyarrow_dtype ):
292300 df [col ] = pack_lists (df [[col ]])
293301 return df
302+
303+
def _should_use_fsspec_optimization(data, explicit_filesystem):
    """Decide whether the fsspec precaching read path should be used.

    Parameters
    ----------
    data : str, Path, UPath, or file-like object
        The data source being opened.
    explicit_filesystem : filesystem or None
        Filesystem instance explicitly supplied by the caller, if any.

    Returns
    -------
    bool
        True if the fsspec-optimized reader should handle this source.
    """
    # An explicitly supplied filesystem always takes precedence; never
    # second-guess it with the optimization.
    if explicit_filesystem is not None:
        return False

    # Already-open file-like objects are consumed as-is.
    if hasattr(data, "read"):
        return False

    # UPath must be tested before Path (UPath subclasses Path); only
    # non-local protocols benefit from the optimized path.
    if isinstance(data, UPath):
        return data.protocol not in ("", "file")

    # Plain Path objects always point at local files.
    if isinstance(data, Path):
        return False

    # Bare strings qualify only when they carry a remote URL scheme.
    if isinstance(data, str):
        remote_schemes = (
            "http://",
            "https://",
            "s3://",
            "gs://",
            "gcs://",
            "azure://",
            "adl://",
        )
        return data.startswith(remote_schemes)

    return False
340+
341+
def _read_with_fsspec_optimization(data, columns, kwargs):
    """Read parquet via fsspec's optimized access for remote storage.

    Uses ``fsspec.parquet.open_parquet_file``, which precaches only the byte
    ranges required for the requested columns, reducing round-trips to remote
    object stores. Any failure (missing fsspec.parquet, or an error during the
    optimized read) falls back to the standard read path.

    Parameters
    ----------
    data : str, UPath, or path-like
        Path to the parquet file.
    columns : list or None
        Columns to read; None reads all columns.
    kwargs : dict
        Additional keyword arguments forwarded to ``pyarrow.parquet.read_table``.

    Returns
    -------
    pyarrow.Table
        The loaded table.
    """
    # The caller only routes here when kwargs.get("filesystem") is None, but
    # the key itself may still be present. Strip it so the explicit
    # ``filesystem=`` argument in the fallback never collides with **kwargs
    # (a duplicate keyword would raise TypeError).
    read_kwargs = {k: v for k, v in kwargs.items() if k != "filesystem"}

    def _fallback_read():
        # Standard (non-precached) read path, shared by both failure modes.
        data_converted, filesystem = _transform_read_parquet_data_arg(data)
        return pq.read_table(
            data_converted, filesystem=filesystem, columns=columns, **read_kwargs
        )

    try:
        import fsspec.parquet
    except ImportError:
        # fsspec.parquet is an optional capability; degrade gracefully.
        return _fallback_read()

    path_str = str(data)
    # UPath carries its own storage options (credentials, endpoint, ...);
    # forward them so fsspec can open the remote file.
    storage_options = None
    if isinstance(data, UPath) and data.storage_options:
        storage_options = data.storage_options

    try:
        # open_parquet_file precaches the footer and the byte ranges of the
        # requested columns before handing back a file-like object.
        with fsspec.parquet.open_parquet_file(
            path_str, columns=columns, storage_options=storage_options, engine="pyarrow"
        ) as parquet_file:
            return pq.read_table(parquet_file, columns=columns, **read_kwargs)
    except Exception:
        # Best-effort optimization: any failure (network, auth, format quirk)
        # falls back to the standard reader instead of surfacing an error.
        return _fallback_read()
0 commit comments