From a2c19f3e23daa73a5b5e34bb9f5b5721042d49e9 Mon Sep 17 00:00:00 2001 From: Jonatan Martens Date: Sat, 29 Nov 2025 14:13:15 +0200 Subject: [PATCH 1/5] Add rename_columns method --- python/pyarrow/_dataset.pyx | 54 ++++++++++++++++++++++++++++ python/pyarrow/tests/test_dataset.py | 29 +++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 666fd2c1cc5..1bf9764cfb8 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -412,6 +412,10 @@ cdef class Dataset(_Weakrefable): n_legs: [[2,4,4,100]] animal: [["Parrot","Dog","Horse","Centipede"]] """ + # Apply column projection from rename_columns() if present + if columns is None and 'columns' in self._scan_options: + columns = self._scan_options['columns'] + return Scanner.from_dataset( self, columns=columns, @@ -990,6 +994,56 @@ cdef class Dataset(_Weakrefable): right_dataset, right_on, right_by, tolerance, output_type=InMemoryDataset) + def rename_columns(self, names): + """ + Apply logical column renaming on the Dataset. + + The rename is applied lazily when data is scanned. Column names in the + files are not changed; the rename is a logical transformation applied + during reads. + + Parameters + ---------- + names : list, tuple, or dict + If a list or tuple, the new names for all columns (must match the + number of columns). If a dict, maps old column names to new names. + + Returns + ------- + Dataset + The existing dataset with column projection applied. + + Examples + -------- + Rename all columns by position: + + >>> dataset.rename_columns(['name', 'age', 'city']).to_table() + + Rename specific columns: + + >>> dataset.rename_columns({'old_name': 'new_name'}).to_table() + """ + import pyarrow.dataset as ds + + schema = self.schema + + if isinstance(names, (list, tuple)): + if len(names) != len(schema): + raise ValueError( + f"Expected {len(schema)} names, got {len(names)}") + name_mapping = {schema.field(i).name: names[i] + for i in range(len(names))} + elif isinstance(names, dict): + name_mapping = names + else: + raise TypeError(f"names must be list, tuple, or dict, not {type(names)!r}") + + projection = {new_name: ds.field(old_name) + for old_name, new_name in name_mapping.items()} + + self._scan_options['columns'] = projection + + return self cdef class InMemoryDataset(Dataset): """ diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 32bcebb28de..13f76040195 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -5932,3 +5932,32 @@ def test_scanner_from_substrait(dataset): filter=ps.BoundExpressions.from_substrait(filtering) ).to_table() assert result.to_pydict() == {'str': ['4', '4']} + + +@pytest.mark.parametrize("names", [ + ["new-index", "new-color"], + ("new-index", "new-color"), + {"index": "new-index", "color": "new-color"} +] +) +def test_rename_columns(names): + original_schema = pa.schema([ + pa.field('index', pa.int64()), + pa.field('color', pa.string()), + ] + ) + + dataset = ds.InMemoryDataset( + pa.RecordBatch.from_pylist( + [{"index": 1, "color": "green"}, {"index": 2, "color": "blue"}]), + schema=original_schema + ) + + dataset.rename_columns(names) + + expected_schema = pa.schema([ + pa.field("new-index", pa.int64()), + pa.field("new-color", pa.string()) + ]) + + assert dataset.to_table().schema.equals(expected_schema) From 04969a5a2ad47eed57f14fda30a539d101d97cb1 Mon Sep 17 00:00:00 2001 From: Jonatan Martens Date: Thu, 4 Dec 2025 09:56:06 +0200 Subject: [PATCH 2/5] Remove examples from docstring --- python/pyarrow/_dataset.pyx | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 1bf9764cfb8..4dad8567815 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1012,16 +1012,6 @@ cdef class Dataset(_Weakrefable): ------- Dataset The existing dataset with column projection applied. - - Examples - -------- - Rename all columns by position: - - >>> dataset.rename_columns(['name', 'age', 'city']).to_table() - - Rename specific columns: - - >>> dataset.rename_columns({'old_name': 'new_name'}).to_table() """ import pyarrow.dataset as ds From f48a7384a4dd74c029a9627bc9c6731703852fc9 Mon Sep 17 00:00:00 2001 From: Jonatan Martens Date: Thu, 4 Dec 2025 14:31:17 +0200 Subject: [PATCH 3/5] Add docstring with proper imports --- python/pyarrow/_dataset.pyx | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 4dad8567815..8a2040c0d23 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1012,6 +1012,24 @@ cdef class Dataset(_Weakrefable): ------- Dataset The existing dataset with column projection applied. + + Examples + -------- + Rename all columns by position: + + >>> import pyarrow as pa + >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], + ... 'n_legs': [2, 2, 4, 4, 5, 100], + ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", + ... "Brittle stars", "Centipede"]}) + + >>> import pyarrow.dataset as ds + >>> dataset = ds.InMemoryDataset([table]) + >>> dataset.rename_columns(['time', 'number_of_legs', 'name']).to_table() + + Rename specific columns: + + >>> dataset.rename_columns({'n_legs': 'number_of_legs'}).to_table() """ import pyarrow.dataset as ds From 70e520d9c0fb6b3c3192dfcf43be2e7ae9e99a0d Mon Sep 17 00:00:00 2001 From: Jonatan Martens Date: Thu, 4 Dec 2025 19:52:09 +0200 Subject: [PATCH 4/5] Add outputs to examples --- python/pyarrow/_dataset.pyx | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 8a2040c0d23..9ce43757d0f 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1026,10 +1026,26 @@ cdef class Dataset(_Weakrefable): >>> import pyarrow.dataset as ds >>> dataset = ds.InMemoryDataset([table]) >>> dataset.rename_columns(['time', 'number_of_legs', 'name']).to_table() + pyarrow.Table + time: int64 + number_of_legs: int64 + name: string + ---- + time: [[2020,2022,2021,2022,2019,2021]] + number_of_legs: [[2,2,4,4,5,100]] + name: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] Rename specific columns: >>> dataset.rename_columns({'n_legs': 'number_of_legs'}).to_table() + pyarrow.Table + year: int64 + number_of_legs: int64 + animal: string + ---- + year: [[2020,2022,2021,2022,2019,2021]] + number_of_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] """ import pyarrow.dataset as ds From 5e6abfd705ac19fdd5ffb422852f170ea94faa79 Mon Sep 17 00:00:00 2001 From: Jonatan Martens Date: Thu, 4 Dec 2025 20:36:17 +0200 Subject: [PATCH 5/5] Fix columns not in rename dict being ignored --- python/pyarrow/_dataset.pyx | 3 ++- python/pyarrow/tests/test_dataset.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 9ce43757d0f..af495ae4c2f 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1058,7 +1058,8 @@ cdef class Dataset(_Weakrefable): name_mapping = {schema.field(i).name: names[i] for i in range(len(names))} elif isinstance(names, dict): - name_mapping = names + name_mapping = {field.name: names.get(field.name, field.name) + for field in schema} else: raise TypeError(f"names must be list, tuple, or dict, not {type(names)!r}") diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 13f76040195..f60fcd2c7d9 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -5934,13 +5934,18 @@ def test_scanner_from_substrait(dataset): assert result.to_pydict() == {'str': ['4', '4']} -@pytest.mark.parametrize("names", [ - ["new-index", "new-color"], - ("new-index", "new-color"), - {"index": "new-index", "color": "new-color"} +@pytest.mark.parametrize("names, expected_schema", [ + (["new-index", "new-color"], + pa.schema([pa.field("new-index", pa.int64()), pa.field("new-color", pa.string())])), + (("new-index", "new-color"), + pa.schema([pa.field("new-index", pa.int64()), pa.field("new-color", pa.string())])), + ({"index": "new-index", "color": "new-color"}, + pa.schema([pa.field("new-index", pa.int64()), pa.field("new-color", pa.string())])), + ({"index": "new-index"}, + pa.schema([pa.field("new-index", pa.int64()), pa.field("color", pa.string())])), ] ) -def test_rename_columns(names): +def test_rename_columns(names, expected_schema): original_schema = pa.schema([ pa.field('index', pa.int64()), pa.field('color', pa.string()), @@ -5955,9 +5960,4 @@ def test_rename_columns(names): dataset.rename_columns(names) - expected_schema = pa.schema([ - pa.field("new-index", pa.int64()), - pa.field("new-color", pa.string()) - ]) - assert dataset.to_table().schema.equals(expected_schema)