rapidsai · galipremsagar · Jun 17, 2025 · Jun 18, 2025 · Jun 19, 2025
@@ -11,7 +11,10 @@
 from cudf.core.column import as_column
 from cudf.core.dtypes import CategoricalDtype
 from cudf.options import get_option
-from cudf.utils.dtypes import can_convert_to_column, cudf_dtype_to_pa_type
+from cudf.utils.dtypes import (
+    can_convert_to_column,
+    cudf_dtype_to_pa_type,
+)
 
 if TYPE_CHECKING:
     from cudf.core.index import Index

@@ -684,7 +684,9 @@ def data_array_view(
                 raise ValueError(f"Unsupported mode: {mode}")
         else:
             obj = None
-        return cuda.as_cuda_array(obj).view(self.dtype)
+        return cuda.as_cuda_array(obj).view(
+            getattr(self.dtype, "numpy_dtype", self.dtype)
+        )
 
     def mask_array_view(
         self, *, mode: Literal["write", "read"] = "write"
@@ -1227,6 +1229,7 @@ def __setitem__(self, key: Any, value: Any) -> None:
         If ``value`` and ``self`` are of different types, ``value`` is coerced
         to ``self.dtype``. Assumes ``self`` and ``value`` are index-aligned.
         """
+        # import pdb;pdb.set_trace()
         value_normalized = self._cast_setitem_value(value)
         if isinstance(key, slice):
             out: ColumnBase | None = self._scatter_by_slice(
@@ -1633,6 +1636,7 @@ def take(
         Skip bounds checking if check_bounds is False.
         Set rows to null for all out of bound indices if nullify is `True`.
         """
+        # import pdb;pdb.set_trace()
         # Handle zero size
         if indices.size == 0:
             return cast(Self, column_empty(row_count=0, dtype=self.dtype))
@@ -1791,6 +1795,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
 
     @acquire_spill_lock()
     def cast(self, dtype: Dtype) -> ColumnBase:
+        # import pdb;pdb.set_trace()
         result = type(self).from_pylibcudf(
             plc.unary.cast(
                 self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
@@ -1802,10 +1807,16 @@ def cast(self, dtype: Dtype) -> ColumnBase:
         ):
             result.dtype.precision = dtype.precision  # type: ignore[union-attr]
         if cudf.get_option("mode.pandas_compatible") and result.dtype != dtype:
+            if self.dtype.kind == "f" and is_pandas_nullable_extension_dtype(
+                dtype
+            ):
+                result = result.set_mask(self.nans_to_nulls().mask)
+                # result = result.nans_to_nulls()
             result._dtype = dtype
         return result
 
     def astype(self, dtype: DtypeObj, copy: bool = False) -> ColumnBase:
+        # import pdb;pdb.set_trace()
         if self.dtype == dtype:
             result = self
         elif len(self) == 0:
@@ -2246,15 +2257,19 @@ def _return_sentinel_column():
     def copy_if_else(
         self, other: Self | plc.Scalar, boolean_mask: NumericalColumn
     ) -> Self:
-        return type(self).from_pylibcudf(  # type: ignore[return-value]
-            plc.copying.copy_if_else(
-                self.to_pylibcudf(mode="read"),
-                other
-                if isinstance(other, plc.Scalar)
-                else other.to_pylibcudf(mode="read"),
-                boolean_mask.to_pylibcudf(mode="read"),
+        return (
+            type(self)
+            .from_pylibcudf(  # type: ignore[return-value]
+                plc.copying.copy_if_else(
+                    self.to_pylibcudf(mode="read"),
+                    other
+                    if isinstance(other, plc.Scalar)
+                    else other.to_pylibcudf(mode="read"),
+                    boolean_mask.to_pylibcudf(mode="read"),
+                )
             )
-        )
+            ._with_type_metadata(self.dtype)
+        )  # type: ignore[return-value]
 
     def split_by_offsets(
         self, offsets: list[int]
@@ -2948,7 +2963,9 @@ def as_column(
             if isinstance(arbitrary, NumpyExtensionArray):
                 # infer_dtype does not handle NumpyExtensionArray
                 arbitrary = np.array(arbitrary, dtype=object)
-            inferred_dtype = infer_dtype(arbitrary)
+            inferred_dtype = infer_dtype(
+                arbitrary, skipna=not cudf.get_option("mode.pandas_compatible")
+            )
             if inferred_dtype in ("mixed-integer", "mixed-integer-float"):
                 raise MixedTypeError("Cannot create column with mixed types")
             elif dtype is None and inferred_dtype not in (
@@ -2984,6 +3001,14 @@ def as_column(
                 arbitrary,
                 from_pandas=True,
             )
+            if (
+                cudf.get_option("mode.pandas_compatible")
+                and inferred_dtype == "mixed"
+                and not isinstance(
+                    pyarrow_array.type, (pa.ListType, pa.StructType)
+                )
+            ):
+                raise MixedTypeError("Cannot create column with mixed types")
             return as_column(
                 pyarrow_array,
                 dtype=dtype,

@@ -199,6 +199,7 @@ def __invert__(self):
             return super().__invert__()
 
     def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
+        # import pdb;pdb.set_trace()
         int_float_dtype_mapping = {
             np.int8: np.float32,
             np.int16: np.float32,
@@ -210,12 +211,34 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             np.uint64: np.float64,
             np.bool_: np.float32,
         }
+        if cudf.get_option("mode.pandas_compatible"):
+            int_float_dtype_mapping = {
+                np.int8: np.float64,
+                np.int16: np.float64,
+                np.int32: np.float64,
+                np.int64: np.float64,
+                np.uint8: np.float64,
+                np.uint16: np.float64,
+                np.uint32: np.float64,
+                np.uint64: np.float64,
+                np.bool_: np.float64,
+            }
 
+        # if self.dtype.kind == "b":
+        #     if op.strip("_").lstrip("r") in ["pow", "truediv", "floordiv"]:
+        #         # match behavior with non-masked bool dtype
+        #         raise NotImplementedError("Power and division not supported for boolean dtype")
+        #     elif op in ["__sub__", "__rsub__"]:
+        #         # exception message would include "numpy boolean subtract"
+        #         raise TypeError("Cannot subtract boolean dtype")
+        # return None
         out_dtype = None
         if op in {"__truediv__", "__rtruediv__"}:
             # Division with integer types results in a suitable float.
             if truediv_type := int_float_dtype_mapping.get(self.dtype.type):
-                return self.astype(np.dtype(truediv_type))._binaryop(other, op)
+                return self.astype(
+                    get_dtype_of_same_kind(self.dtype, np.dtype(truediv_type))
+                )._binaryop(other, op)
         elif op in {
             "__lt__",
             "__gt__",
@@ -258,6 +281,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         )
 
         if out_dtype is None:
+            # import pdb;pdb.set_trace()
             out_dtype = find_common_type((self.dtype, other_cudf_dtype))
             if op in {"__mod__", "__floordiv__"}:
                 tmp = self if reflect else other
@@ -270,6 +294,10 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
                     out_dtype = get_dtype_of_same_kind(
                         out_dtype, np.dtype(np.float64)
                     )
+                # elif tmp_dtype.kind == "b":
+                #     out_dtype = get_dtype_of_same_kind(
+                #         out_dtype, np.dtype(np.int8)
+                #     )
 
         if op in {"__and__", "__or__", "__xor__"}:
             if self.dtype.kind == "f" or other_cudf_dtype.kind == "f":
@@ -296,7 +324,19 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             lhs = pa_scalar_to_plc_scalar(lhs)
         elif isinstance(rhs, pa.Scalar):
             rhs = pa_scalar_to_plc_scalar(rhs)
-        return binaryop.binaryop(lhs, rhs, op, out_dtype)
+        res = binaryop.binaryop(lhs, rhs, op, out_dtype)
+        if op in {"__mod__", "__floordiv__"} and tmp_dtype.kind == "b":
+            res = res.astype(
+                get_dtype_of_same_kind(out_dtype, np.dtype(np.int8))
+            )
+        elif (
+            op == "INT_POW"
+            and res.null_count
+            and not isinstance(rhs, plc.Scalar)
+        ):
+            res = res.copy_if_else(lhs, res._get_mask_as_column())
+            pass
+        return res
 
     def nans_to_nulls(self: Self) -> Self:
         # Only floats can contain nan.
@@ -338,7 +378,10 @@ def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase:
             #   => np.int64
             # np.promote_types(np.asarray([0], dtype=np.int64).dtype, np.uint8)
             #   => np.int64
-            common_dtype = np.result_type(self.dtype, other)  # noqa: TID251
+
+            common_dtype = np.result_type(
+                getattr(self.dtype, "numpy_dtype", self.dtype), other
+            )
             if common_dtype.kind in {"b", "i", "u", "f"}:
                 if self.dtype.kind == "b" and not isinstance(other, bool):
                     common_dtype = min_signed_type(other)
@@ -405,16 +448,21 @@ def as_decimal_column(self, dtype: DecimalDtype) -> DecimalBaseColumn:
         return self.cast(dtype=dtype)  # type: ignore[return-value]
 
     def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
+        # import pdb;pdb.set_trace()
         if dtype == self.dtype:
             return self
         if cudf.get_option("mode.pandas_compatible"):
             if dtype_to_pylibcudf_type(dtype) == dtype_to_pylibcudf_type(
                 self.dtype
             ):
+                if self.dtype.kind == "f":
+                    res = self.nans_to_nulls()
+                else:
+                    res = self
                 # Short-circuit the cast if the dtypes are equivalent
                 # but not the same type object.
-                self._dtype = dtype
-                return self
+                res._dtype = dtype
+                return res
         return self.cast(dtype=dtype)  # type: ignore[return-value]
 
     def all(self, skipna: bool = True) -> bool:

@@ -5748,6 +5748,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
         1  1  2
         2  3  4
         """
+        # import pdb;pdb.set_trace()
         if nan_as_null is no_default:
             nan_as_null = (
                 False if cudf.get_option("mode.pandas_compatible") else None
@@ -8708,6 +8709,7 @@ def from_pandas(obj, nan_as_null=no_default):
     >>> type(pmidx)
     <class 'pandas.core.indexes.multi.MultiIndex'>
     """
+    # import pdb;pdb.set_trace()
     if nan_as_null is no_default:
         nan_as_null = (
             False if cudf.get_option("mode.pandas_compatible") else None

@@ -65,6 +65,7 @@
     find_common_type,
     is_dtype_obj_numeric,
     is_mixed_with_object_dtype,
+    is_pandas_nullable_extension_dtype,
 )
 from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _EQUALITY_OPS, _is_same_name
@@ -205,7 +206,9 @@ def __setitem__(self, key, value):
             # In contrast to Column.__setitem__ (which downcasts the value to
             # the dtype of the column) here we upcast the series to the
             # larger data type mimicking pandas
-            if not (value is None or value is cudf.NA or value is np.nan):
+            if not (value is None or value is cudf.NA or value is np.nan) and (
+                not is_pandas_nullable_extension_dtype(self._frame.dtype)
+            ):
                 tmp_value = as_column(value)
                 if tmp_value.dtype.kind in "uifb" and not (
                     self._frame.dtype.kind == "b"
@@ -1268,6 +1271,7 @@ def __getitem__(self, arg):
 
     @_performance_tracking
     def __setitem__(self, key, value):
+        # import pdb;pdb.set_trace()
         if isinstance(key, slice):
             self.iloc[key] = value
         else:
@@ -2926,6 +2930,8 @@ def unique(self):
         """
         res = self._column.unique()
         if cudf.get_option("mode.pandas_compatible"):
+            if is_pandas_nullable_extension_dtype(self.dtype):
+                raise NotImplementedError("cudf does not support arrays")
             return res.values
         return Series._from_column(res, name=self.name)