ENH: Implement pandas.read_iceberg #61383

Open

wants to merge 37 commits into base: main

Changes from all commits (37 commits)
aa59971
ENH: Implement pandas.read_iceberg
datapythonista Apr 30, 2025
9ef8e4e
Typo in API index
datapythonista Apr 30, 2025
34792c9
Run iceberg tests
datapythonista Apr 30, 2025
c1b2426
Fix docstring example
datapythonista Apr 30, 2025
6081bfb
Fixes to the docstring
datapythonista Apr 30, 2025
91370db
Creating catalog dynamically
datapythonista Apr 30, 2025
ecf40ff
Fixing tests
datapythonista May 2, 2025
8c1e4dc
Adding debug info
datapythonista May 3, 2025
ee61079
Making pyiceberg tests run in a single cpu
datapythonista May 3, 2025
2debd5f
Debugging CI problems
datapythonista May 3, 2025
e593977
Bump pyiceberg version to 0.8.1
datapythonista May 3, 2025
24c0ceb
Revert "Bump pyiceberg version to 0.8.1"
datapythonista May 3, 2025
5fc738e
Commenting debugging information
datapythonista May 3, 2025
4953745
Bump minimum version to 0.7.1
datapythonista May 4, 2025
17d73e8
Removing debug code
datapythonista May 4, 2025
5f07a49
Allowing older version of gcsfs
datapythonista May 4, 2025
32add5f
Allowing an older version of s3fs
datapythonista May 4, 2025
3b0d7ee
Allowing an older version of fsspec
datapythonista May 5, 2025
7018c11
adding pyiceberg to requirements
datapythonista May 5, 2025
75c24d6
empty
datapythonista May 5, 2025
9c343a5
pre-commit
datapythonista May 5, 2025
9cd2d5c
Updating gcsfs minimum version
datapythonista May 5, 2025
301e988
Print difference in validation of minimum versions
datapythonista May 5, 2025
15f6397
Fix diff print
datapythonista May 5, 2025
f973e61
Fix bug when showing diff
datapythonista May 5, 2025
9110f2c
Merge remote-tracking branch 'upstream/main' into read_iceberg
datapythonista May 5, 2025
c13ce5b
debug validate min versions
datapythonista May 5, 2025
bc4d689
Updating new CI deps, reverting validate min versions script changes
datapythonista May 5, 2025
735c48c
Reverting test data to working version
datapythonista May 5, 2025
761b92e
Making read_iceberg experimental
datapythonista May 12, 2025
c25536c
Merge from main
datapythonista May 12, 2025
0a2e9ea
Using fixture for catalog
datapythonista May 12, 2025
2ef6343
Fix tests after using fixture
datapythonista May 12, 2025
7dd054b
Remove unneeded list when defining single cpu pytest mark
datapythonista May 12, 2025
74a1e65
whatsnew entry
datapythonista May 12, 2025
6917b30
Merge branch 'read_iceberg' of github.com:datapythonista/pandas into …
datapythonista May 12, 2025
dd0b5e4
Merge branch 'main' into read_iceberg
datapythonista May 14, 2025
7 changes: 4 additions & 3 deletions ci/deps/actions-310-minimum_versions.yaml
Original file line number Diff line number Diff line change
@@ -28,10 +28,10 @@ dependencies:
- beautifulsoup4=4.12.3
- bottleneck=1.3.6
- fastparquet=2024.2.0
- fsspec=2024.2.0
- fsspec=2023.12.2
- html5lib=1.1
- hypothesis=6.84.0
- gcsfs=2024.2.0
- gcsfs=2023.12.2
- jinja2=3.1.3
- lxml=4.9.2
- matplotlib=3.8.3
@@ -42,14 +42,15 @@ dependencies:
- openpyxl=3.1.2
- psycopg2=2.9.6
- pyarrow=10.0.1
- pyiceberg=0.7.1
- pymysql=1.1.0
- pyqt=5.15.9
- pyreadstat=1.2.6
- pytables=3.8.0
- python-calamine=0.1.7
- pytz=2023.4
- pyxlsb=1.0.10
- s3fs=2024.2.0
- s3fs=2023.12.2
- scipy=1.12.0
- sqlalchemy=2.0.0
- tabulate=0.9.0
7 changes: 4 additions & 3 deletions ci/deps/actions-310.yaml
@@ -26,10 +26,10 @@ dependencies:
- beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- fsspec>=2024.2.0
- fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- gcsfs>=2024.2.0
- gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -40,14 +40,15 @@ dependencies:
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyqt>=5.15.9
- pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2024.2.0
- s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
7 changes: 4 additions & 3 deletions ci/deps/actions-311-downstream_compat.yaml
@@ -27,10 +27,10 @@ dependencies:
- beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- fsspec>=2024.2.0
- fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- gcsfs>=2024.2.0
- gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -41,14 +41,15 @@ dependencies:
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyqt>=5.15.9
- pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2024.2.0
- s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
7 changes: 4 additions & 3 deletions ci/deps/actions-311.yaml
@@ -26,10 +26,10 @@ dependencies:
- beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- fsspec>=2024.2.0
- fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- gcsfs>=2024.2.0
- gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -41,13 +41,14 @@ dependencies:
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2024.2.0
- s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
7 changes: 4 additions & 3 deletions ci/deps/actions-312.yaml
@@ -26,10 +26,10 @@ dependencies:
- beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- fsspec>=2024.2.0
- fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- gcsfs>=2024.2.0
- gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -41,13 +41,14 @@ dependencies:
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2024.2.0
- s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
6 changes: 3 additions & 3 deletions ci/deps/actions-313.yaml
@@ -27,10 +27,10 @@ dependencies:
- blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- fsspec>=2024.2.0
- fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- gcsfs>=2024.2.0
- gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -48,7 +48,7 @@ dependencies:
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2024.2.0
- s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
9 changes: 5 additions & 4 deletions doc/source/getting_started/install.rst
@@ -299,7 +299,7 @@ Dependency Minimum Versi
Other data sources
^^^^^^^^^^^^^^^^^^

Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"``
Installable with ``pip install "pandas[hdf5, parquet, iceberg, feather, spss, excel]"``

====================================================== ================== ================ ==========================================================
Dependency Minimum Version pip extra Notes
@@ -308,6 +308,7 @@ Dependency Minimum Version pip ex
`zlib <https://github.com/madler/zlib>`__ hdf5 Compression for HDF5
`fastparquet <https://github.com/dask/fastparquet>`__ 2024.2.0 - Parquet reading / writing (pyarrow is default)
`pyarrow <https://github.com/apache/arrow>`__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing
`PyIceberg <https://py.iceberg.apache.org/>`__ 0.7.1 iceberg Apache Iceberg reading
`pyreadstat <https://github.com/Roche/pyreadstat>`__ 1.2.6 spss SPSS files (.sav) reading
`odfpy <https://github.com/eea/odfpy>`__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing
====================================================== ================== ================ ==========================================================
@@ -328,10 +329,10 @@ Installable with ``pip install "pandas[fss, aws, gcp]"``
============================================ ================== =============== ==========================================================
Dependency Minimum Version pip extra Notes
============================================ ================== =============== ==========================================================
`fsspec <https://github.com/fsspec>`__ 2024.2.0 fss, gcp, aws Handling files aside from simple local and HTTP (required
`fsspec <https://github.com/fsspec>`__ 2023.12.2 fss, gcp, aws Handling files aside from simple local and HTTP (required
dependency of s3fs, gcsfs).
`gcsfs <https://github.com/fsspec/gcsfs>`__ 2024.2.0 gcp Google Cloud Storage access
`s3fs <https://github.com/fsspec/s3fs>`__ 2024.2.0 aws Amazon S3 access
`gcsfs <https://github.com/fsspec/gcsfs>`__ 2023.12.2 gcp Google Cloud Storage access
`s3fs <https://github.com/fsspec/s3fs>`__ 2023.12.2 aws Amazon S3 access
============================================ ================== =============== ==========================================================

Clipboard
9 changes: 9 additions & 0 deletions doc/source/reference/io.rst
@@ -156,6 +156,15 @@ Parquet
read_parquet
DataFrame.to_parquet

Iceberg
~~~~~~~
.. autosummary::
:toctree: api/

read_iceberg

.. warning:: ``read_iceberg`` is experimental and may change without warning.

ORC
~~~
.. autosummary::
97 changes: 97 additions & 0 deletions doc/source/user_guide/io.rst
@@ -29,6 +29,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
binary,`HDF5 Format <https://support.hdfgroup.org/documentation/hdf5/latest/_intro_h_d_f5.html>`__, :ref:`read_hdf<io.hdf5>`, :ref:`to_hdf<io.hdf5>`
binary,`Feather Format <https://github.com/wesm/feather>`__, :ref:`read_feather<io.feather>`, :ref:`to_feather<io.feather>`
binary,`Parquet Format <https://parquet.apache.org/>`__, :ref:`read_parquet<io.parquet>`, :ref:`to_parquet<io.parquet>`
binary,`Apache Iceberg <https://iceberg.apache.org/>`__, :ref:`read_iceberg<io.iceberg>` , NA
binary,`ORC Format <https://orc.apache.org/>`__, :ref:`read_orc<io.orc>`, :ref:`to_orc<io.orc>`
binary,`Stata <https://en.wikipedia.org/wiki/Stata>`__, :ref:`read_stata<io.stata_reader>`, :ref:`to_stata<io.stata_writer>`
binary,`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__, :ref:`read_sas<io.sas_reader>` , NA
@@ -5403,6 +5404,102 @@ The above example creates a partitioned dataset that may look like:
except OSError:
pass

.. _io.iceberg:

Iceberg
-------

.. versionadded:: 3.0.0

Apache Iceberg is a high-performance, open-source format for large analytic tables.
Iceberg enables the use of SQL tables for big data while making it possible for different
engines to safely work with the same tables at the same time.

Iceberg supports predicate pushdown and column pruning, which are available to pandas
users via the ``row_filter`` and ``selected_fields`` parameters of the :func:`~pandas.read_iceberg`
function. This makes it convenient to extract from large tables a subset that fits in memory as a
pandas ``DataFrame``.

Internally, pandas uses PyIceberg_ to query Iceberg.

.. _PyIceberg: https://py.iceberg.apache.org/

A simple example loads all data from an Iceberg table ``my_table`` defined in the
``my_catalog`` catalog:

.. code-block:: python

df = pd.read_iceberg("my_table", catalog_name="my_catalog")

Catalogs must be defined in the ``.pyiceberg.yaml`` file, usually in the home directory.
It is possible to change properties of the catalog definition with the
``catalog_properties`` parameter:

.. code-block:: python

df = pd.read_iceberg(
"my_table",
catalog_name="my_catalog",
catalog_properties={"s3.secret-access-key": "my_secret"},
)
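
As a sketch, a minimal ``~/.pyiceberg.yaml`` defining ``my_catalog`` could look like the
following (the URI and credentials are placeholders, not working values):

.. code-block:: yaml

    catalog:
      my_catalog:
        uri: http://127.0.0.1:8181
        s3.endpoint: http://127.0.0.1:9000
        s3.access-key-id: my_key
        s3.secret-access-key: my_secret

See the PyIceberg documentation for the full list of supported catalog properties.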

It is also possible to fully specify the catalog in ``catalog_properties`` and not provide
a ``catalog_name``:

.. code-block:: python

df = pd.read_iceberg(
"my_table",
catalog_properties={
"uri": "http://127.0.0.1:8181",
"s3.endpoint": "http://127.0.0.1:9000",
},
)

To create the ``DataFrame`` with only a subset of the columns:

.. code-block:: python

df = pd.read_iceberg(
"my_table",
catalog_name="my_catalog",
        selected_fields=["my_column_3", "my_column_7"],
)

This executes faster, since the other columns are not read, and it also saves memory,
since the data from those columns is never loaded into the underlying memory of the
``DataFrame``.

To fetch only a subset of the rows, use the ``limit`` parameter:

.. code-block:: python

df = pd.read_iceberg(
"my_table",
catalog_name="my_catalog",
limit=100,
)

This will create a ``DataFrame`` with 100 rows, assuming the table contains at least
that many rows.

To fetch a subset of the rows based on a condition, use the ``row_filter``
parameter:

.. code-block:: python

df = pd.read_iceberg(
"my_table",
catalog_name="my_catalog",
row_filter="distance > 10.0",
)
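
These parameters can be combined. As a sketch, the following would load at most 100 rows
of two columns matching a filter (the column names are placeholders):

.. code-block:: python

    df = pd.read_iceberg(
        "my_table",
        catalog_name="my_catalog",
        row_filter="distance > 10.0",
        selected_fields=["my_column_3", "my_column_7"],
        limit=100,
    )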

Reading a particular snapshot is also possible, by providing its ID via the
``snapshot_id`` parameter.
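
As a sketch (the snapshot ID below is a made-up placeholder; real IDs come from the table
metadata, for example via PyIceberg's ``Table.history()``):

.. code-block:: python

    df = pd.read_iceberg(
        "my_table",
        catalog_name="my_catalog",
        snapshot_id=3051729675574597004,
    )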

More information about the Iceberg format can be found on the `Apache Iceberg official
website <https://iceberg.apache.org/>`__.

.. _io.orc:

ORC
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -78,6 +78,7 @@ Other enhancements
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
- Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
- Added support to read from Apache Iceberg tables with the new :func:`read_iceberg` function (:issue:`61383`)
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)
- Improved deprecation message for offset aliases (:issue:`60820`)
7 changes: 4 additions & 3 deletions environment.yml
@@ -29,10 +29,10 @@ dependencies:
- beautifulsoup4>=4.12.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- fsspec>=2024.2.0
- fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- gcsfs>=2024.2.0
- gcsfs>=2023.12.2
- ipython
- pickleshare # Needed for IPython Sphinx directive in the docs GH#60429
- jinja2>=3.1.3
@@ -44,13 +44,14 @@ dependencies:
- odfpy>=1.4.1
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- s3fs>=2024.2.0
- s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
2 changes: 2 additions & 0 deletions pandas/__init__.py
@@ -164,6 +164,7 @@
read_stata,
read_sas,
read_spss,
read_iceberg,
)

from pandas.io.json._normalize import json_normalize
@@ -319,6 +320,7 @@
"read_fwf",
"read_hdf",
"read_html",
"read_iceberg",
"read_json",
"read_orc",
"read_parquet",