Skip to content

Require dtype argument to cudf_polars Column container #19193

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
fba9e00
Support pl.struct
mroeschke Jun 2, 2025
1268bb5
Pass along dtype in Column container
mroeschke Jun 3, 2025
685c3fa
Merge remote-tracking branch 'upstream/branch-25.08' into feat/cudf_p…
mroeschke Jun 3, 2025
57e58b0
Recursively set struct fields in ColumnMetadata
mroeschke Jun 3, 2025
ac841f2
Merge branch 'branch-25.08' into feat/cudf_polars/struct_expr
mroeschke Jun 3, 2025
2bd55a6
Merge remote-tracking branch 'upstream/branch-25.08' into feat/cudf_p…
mroeschke Jun 4, 2025
91563c1
Merge remote-tracking branch 'upstream/branch-25.08' into feat/cudf_p…
mroeschke Jun 12, 2025
c0a412b
Merge branch 'feat/cudf_polars/struct_expr' of https://github.com/mro…
mroeschke Jun 12, 2025
0364f45
Merge branch 'branch-25.08' into feat/cudf_polars/struct_expr
mroeschke Jun 13, 2025
83137d7
Merge remote-tracking branch 'upstream/branch-25.08' into feat/cudf_p…
mroeschke Jun 13, 2025
d45201f
Make can_cast return False for nested types
mroeschke Jun 13, 2025
446f4da
Merge remote-tracking branch 'upstream/branch-25.08' into feat/cudf_p…
mroeschke Jun 16, 2025
05a607e
Add tests to xfail list
mroeschke Jun 16, 2025
aeeb515
Merge remote-tracking branch 'upstream/branch-25.08' into feat/cudf_p…
mroeschke Jun 16, 2025
aab241e
Merge remote-tracking branch 'upstream/branch-25.08' into feat/cudf_p…
mroeschke Jun 17, 2025
753ba58
Replace with polars issue
mroeschke Jun 17, 2025
87793b5
Add test for nested struct
mroeschke Jun 17, 2025
9b0f2ee
Merge branch 'branch-25.08' into feat/cudf_polars/struct_expr
mroeschke Jun 17, 2025
87af891
Add dtypes argument to DataFrame.from_table
mroeschke Jun 17, 2025
2b6f848
Remove tests that require from_table to have a dtype
mroeschke Jun 17, 2025
92b23a3
Make dtype a required argument in Column
mroeschke Jun 17, 2025
32b9949
Remove getattr workaround
mroeschke Jun 17, 2025
0c611cd
Merge remote-tracking branch 'upstream/branch-25.08' into ref/cudf_po…
mroeschke Jun 18, 2025
1327b44
Add _dtype_short_repr_to_dtype to address nested type limitations
mroeschke Jun 18, 2025
37f76ac
Merge remote-tracking branch 'upstream/branch-25.08' into ref/cudf_po…
mroeschke Jun 18, 2025
2184dfd
Use _ for value_counts unused vars
mroeschke Jun 18, 2025
aa147f1
Add div by zero value_counts case
mroeschke Jun 18, 2025
a44f5e4
Add test for value_counts not implemented in groupby
mroeschke Jun 18, 2025
289f81b
Add back xfailing tests that require nested list[struct] support
mroeschke Jun 18, 2025
59036a2
Merge remote-tracking branch 'upstream/branch-25.08' into ref/cudf_po…
mroeschke Jun 18, 2025
1bccd42
Ensure we return a pl.DataType instance
mroeschke Jun 18, 2025
6516650
Merge remote-tracking branch 'upstream/branch-25.08' into ref/cudf_po…
mroeschke Jun 18, 2025
2292a95
Raise during deserialization for invalid dtype
mroeschke Jun 18, 2025
2cecd8a
Raise during deserialization for invalid dtype
mroeschke Jun 18, 2025
0560c3b
Add test for custom list parsing
mroeschke Jun 18, 2025
e320742
Merge remote-tracking branch 'upstream/branch-25.08' into ref/cudf_po…
mroeschke Jun 25, 2025
760c084
Remove pyarrow import from boolean.py
mroeschke Jun 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 22 additions & 16 deletions python/cudf_polars/cudf_polars/containers/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from __future__ import annotations

import functools
import inspect
from typing import TYPE_CHECKING

import polars as pl
Expand Down Expand Up @@ -37,6 +38,19 @@
__all__: list[str] = ["Column"]


def _dtype_short_repr_to_dtype(dtype_str: str) -> pl.DataType:
    """Convert a Polars dtype short repr to a Polars dtype."""
    # dtype_short_repr_to_dtype cannot parse nested list reprs itself
    # (limitations described in py-polars/polars/datatypes/convert.py#L299),
    # so peel off each "list[...]" layer here and recurse on the inner repr.
    prefix = "list["
    if dtype_str.startswith(prefix):
        inner_repr = dtype_str.removeprefix(prefix).removesuffix("]")
        return pl.List(_dtype_short_repr_to_dtype(inner_repr))
    parsed = pl.datatypes.convert.dtype_short_repr_to_dtype(dtype_str)
    if parsed is None:
        raise ValueError(f"{dtype_str} was not able to be parsed by Polars.")
    # Polars may hand back either a DataType class or an instance;
    # normalize to an instance before returning.
    if inspect.isclass(parsed):
        return parsed()
    return parsed
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How safe is pl_type(), without any arguments, here? Some types (Array, Enum) require additional arguments. Maybe we don't support those yet?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the types we support in DataType, I believe this is fairly safe as I'm hoping that dtype_short_repr_to_dtype will return instances for types with parameters (polars.Datetime and polars.Duration).

For those types that we don't support that take arguments, those should be rejected when constructing a DataType



class Column:
"""An immutable column with sortedness metadata."""

Expand All @@ -48,19 +62,17 @@ class Column:
# Optional name, only ever set by evaluation of NamedExpr nodes
# The internal evaluation should not care about the name.
name: str | None
# Optional dtype, used for preserving dtype metadata like
# struct fields
dtype: DataType | None
dtype: DataType

def __init__(
self,
column: plc.Column,
dtype: DataType,
*,
is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
order: plc.types.Order = plc.types.Order.ASCENDING,
null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
name: str | None = None,
dtype: DataType | None = None,
):
self.obj = column
self.is_scalar = self.size == 1
Expand Down Expand Up @@ -98,12 +110,9 @@ def deserialize_ctor_kwargs(
column_kwargs: ColumnOptions,
) -> DeserializedColumnOptions:
"""Deserialize the constructor kwargs for a Column."""
if (serialized_dtype := column_kwargs.get("dtype", None)) is not None:
dtype: DataType | None = DataType( # pragma: no cover
pl.datatypes.convert.dtype_short_repr_to_dtype(serialized_dtype)
)
else: # pragma: no cover
dtype = None # pragma: no cover
dtype = DataType( # pragma: no cover
_dtype_short_repr_to_dtype(column_kwargs["dtype"])
)
return {
"is_sorted": column_kwargs["is_sorted"],
"order": column_kwargs["order"],
Expand Down Expand Up @@ -142,15 +151,12 @@ def serialize(

def serialize_ctor_kwargs(self) -> ColumnOptions:
"""Serialize the constructor kwargs for self."""
serialized_dtype = (
None if self.dtype is None else pl.polars.dtype_str_repr(self.dtype.polars)
)
return {
"is_sorted": self.is_sorted,
"order": self.order,
"null_order": self.null_order,
"name": self.name,
"dtype": serialized_dtype,
"dtype": pl.polars.dtype_str_repr(self.dtype.polars),
}

@functools.cached_property
Expand Down Expand Up @@ -406,7 +412,7 @@ def mask_nans(self) -> Self:
if plc.traits.is_floating_point(self.obj.type()):
old_count = self.null_count
mask, new_count = plc.transform.nans_to_nulls(self.obj)
result = type(self)(self.obj.with_mask(mask, new_count))
result = type(self)(self.obj.with_mask(mask, new_count), self.dtype)
if old_count == new_count:
return result.sorted_like(self)
return result
Expand Down Expand Up @@ -454,4 +460,4 @@ def slice(self, zlice: Slice | None) -> Self:
conversion.from_polars_slice(zlice, num_rows=self.size),
)
(column,) = table.columns()
return type(self)(column, name=self.name).sorted_like(self)
return type(self)(column, name=self.name, dtype=self.dtype).sorted_like(self)
34 changes: 21 additions & 13 deletions python/cudf_polars/cudf_polars/containers/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@


def _create_polars_column_metadata(
name: str | None, dtype: pl.DataType | None
name: str | None, dtype: pl.DataType
) -> plc.interop.ColumnMetadata:
"""Create ColumnMetadata preserving pl.Struct field names."""
if isinstance(dtype, pl.Struct):
Expand Down Expand Up @@ -72,6 +72,7 @@ def __init__(self, columns: Iterable[Column]) -> None:
if any(c.name is None for c in columns):
raise ValueError("All columns must have a name")
self.columns = [cast(NamedColumn, c) for c in columns]
self.dtypes = [c.dtype for c in self.columns]
self.column_map = {c.name: c for c in self.columns}
self.table = plc.Table([c.obj for c in self.columns])

Expand All @@ -89,12 +90,8 @@ def to_polars(self) -> pl.DataFrame:
# serialise with names we control and rename with that map.
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
metadata = [
_create_polars_column_metadata(
name,
# Can remove the getattr if we ever consistently set Column.dtype
getattr(col.dtype, "polars", None),
)
for name, col in zip(name_map, self.columns, strict=True)
_create_polars_column_metadata(name, dtype.polars)
for name, dtype in zip(name_map, self.dtypes, strict=True)
]
table_with_metadata = _ObjectWithArrowMetadata(self.table, metadata)
df = pl.DataFrame(table_with_metadata)
Expand Down Expand Up @@ -148,7 +145,9 @@ def from_polars(cls, df: pl.DataFrame) -> Self:
)

@classmethod
def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
def from_table(
cls, table: plc.Table, names: Sequence[str], dtypes: Sequence[DataType]
) -> Self:
"""
Create from a pylibcudf table.

Expand All @@ -158,6 +157,8 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
Pylibcudf table to obtain columns from
names
Names for the columns
dtypes
Dtypes for the columns

Returns
-------
Expand All @@ -172,9 +173,8 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
if table.num_columns() != len(names):
raise ValueError("Mismatching name and table length.")
return cls(
# TODO: Pass along dtypes here
Column(c, name=name)
for c, name in zip(table.columns(), names, strict=True)
Column(c, name=name, dtype=dtype)
for c, name, dtype in zip(table.columns(), names, dtypes, strict=True)
)

@classmethod
Expand Down Expand Up @@ -317,7 +317,11 @@ def select_columns(self, names: Set[str]) -> list[Column]:
def filter(self, mask: Column) -> Self:
"""Return a filtered table given a mask."""
table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
return type(self).from_table(table, self.column_names).sorted_like(self)
return (
type(self)
.from_table(table, self.column_names, self.dtypes)
.sorted_like(self)
)

def slice(self, zlice: Slice | None) -> Self:
"""
Expand All @@ -338,4 +342,8 @@ def slice(self, zlice: Slice | None) -> Self:
(table,) = plc.copying.slice(
self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
)
return type(self).from_table(table, self.column_names).sorted_like(self)
return (
type(self)
.from_table(table, self.column_names, self.dtypes)
.sorted_like(self)
)
24 changes: 6 additions & 18 deletions python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,13 @@
from functools import partial, reduce
from typing import TYPE_CHECKING, Any, ClassVar

import pyarrow as pa

import pylibcudf as plc

from cudf_polars.containers import Column
from cudf_polars.containers import Column, DataType
from cudf_polars.dsl.expressions.base import (
ExecutionContext,
Expr,
)
from cudf_polars.dsl.expressions.literal import LiteralColumn
from cudf_polars.utils.versions import POLARS_VERSION_LT_128

if TYPE_CHECKING:
Expand All @@ -28,7 +25,7 @@
import polars.type_aliases as pl_types
from polars.polars import _expr_nodes as pl_expr

from cudf_polars.containers import DataFrame, DataType
from cudf_polars.containers import DataFrame

__all__ = ["BooleanFunction"]

Expand Down Expand Up @@ -99,15 +96,6 @@ def __init__(
# TODO: If polars IR doesn't put the casts in, we need to
# mimic the supertype promotion rules.
raise NotImplementedError("IsIn doesn't support supertype casting")
if self.name is BooleanFunction.Name.IsIn:
_, haystack = self.children
# TODO: Use pl.List isinstance check once we have https://github.com/rapidsai/cudf/pull/18564
if isinstance(haystack, LiteralColumn) and isinstance(
haystack.value, pa.ListArray
):
raise NotImplementedError(
"IsIn does not support nested list column input"
) # pragma: no cover

@staticmethod
def _distinct(
Expand Down Expand Up @@ -302,10 +290,10 @@ def do_evaluate(
needles, haystack = columns
if haystack.obj.type().id() == plc.TypeId.LIST:
# Unwrap values from the list column
haystack = Column(haystack.obj.children()[1])
# TODO: Remove check once Column's require dtype
if needles.dtype is not None:
haystack = haystack.astype(needles.dtype)
haystack = Column(
haystack.obj.children()[1],
dtype=DataType(haystack.dtype.polars.inner),
).astype(needles.dtype)
if haystack.size:
return Column(
plc.search.contains(haystack.obj, needles.obj), dtype=self.dtype
Expand Down
42 changes: 22 additions & 20 deletions python/cudf_polars/cudf_polars/dsl/expressions/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
from enum import IntEnum, auto
from typing import TYPE_CHECKING, Any

import polars as pl
from polars.exceptions import InvalidOperationError

import pylibcudf as plc

from cudf_polars.containers import Column, DataType
from cudf_polars.containers import Column
from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
from cudf_polars.dsl.utils.reshape import broadcast
Expand All @@ -24,7 +23,7 @@

from polars.polars import _expr_nodes as pl_expr

from cudf_polars.containers import DataFrame
from cudf_polars.containers import DataFrame, DataType

__all__ = ["StringFunction"]

Expand Down Expand Up @@ -211,9 +210,9 @@ def do_evaluate(
"""Evaluate this expression given a dataframe for context."""
if self.name is StringFunction.Name.ConcatHorizontal:
columns = [
Column(child.evaluate(df, context=context).obj).astype(
DataType(pl.String())
)
Column(
child.evaluate(df, context=context).obj, dtype=child.dtype
).astype(self.dtype)
for child in self.children
]

Expand All @@ -226,13 +225,12 @@ def do_evaluate(
return Column(
plc.strings.combine.concatenate(
plc.Table([col.obj for col in broadcasted]),
plc.Scalar.from_py(delimiter, plc.DataType(plc.TypeId.STRING)),
None
if ignore_nulls
else plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING)),
plc.Scalar.from_py(delimiter, self.dtype.plc),
None if ignore_nulls else plc.Scalar.from_py(None, self.dtype.plc),
None,
plc.strings.combine.SeparatorOnNulls.NO,
)
),
dtype=self.dtype,
)
elif self.name is StringFunction.Name.ConcatVertical:
(child,) = self.children
Expand Down Expand Up @@ -323,20 +321,21 @@ def do_evaluate(
if self.children[1].value is None:
return Column(
plc.Column.from_scalar(
plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING)),
plc.Scalar.from_py(None, self.dtype.plc),
column.size,
)
),
self.dtype,
)
elif self.children[1].value == 0:
result = plc.Column.from_scalar(
plc.Scalar.from_py("", plc.DataType(plc.TypeId.STRING)),
plc.Scalar.from_py("", self.dtype.plc),
column.size,
)
if column.obj.null_mask():
result = result.with_mask(
column.obj.null_mask(), column.obj.null_count()
)
return Column(result)
return Column(result, self.dtype)

else:
start = -(self.children[1].value)
Expand All @@ -347,7 +346,8 @@ def do_evaluate(
plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
None,
)
),
self.dtype,
)
elif self.name is StringFunction.Name.Head:
column = self.children[0].evaluate(df, context=context)
Expand All @@ -358,16 +358,18 @@ def do_evaluate(
if end is None:
return Column(
plc.Column.from_scalar(
plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING)),
plc.Scalar.from_py(None, self.dtype.plc),
column.size,
)
),
self.dtype,
)
return Column(
plc.strings.slice.slice_strings(
column.obj,
plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT32)),
plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
)
),
self.dtype,
)

columns = [child.evaluate(df, context=context) for child in self.children]
Expand Down Expand Up @@ -446,7 +448,7 @@ def do_evaluate(
)
elif self.name is StringFunction.Name.Titlecase:
(column,) = columns
return Column(plc.strings.capitalize.title(column.obj))
return Column(plc.strings.capitalize.title(column.obj), dtype=self.dtype)
raise NotImplementedError(
f"StringFunction {self.name}"
) # pragma: no cover; handled by init raising
2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/dsl/expressions/unary.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def do_evaluate(
null_order=null_order,
)
elif self.name == "value_counts":
(sort, parallel, name, normalize) = self.options
(sort, _, _, normalize) = self.options
count_agg = [plc.aggregation.count(plc.types.NullPolicy.INCLUDE)]
gb_requests = [
plc.groupby.GroupByRequest(
Expand Down
Loading