Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
from cudf.core.column import as_column
from cudf.core.dtypes import CategoricalDtype
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column, cudf_dtype_to_pa_type
from cudf.utils.dtypes import (
can_convert_to_column,
cudf_dtype_to_pa_type,
)

if TYPE_CHECKING:
from cudf.core.index import Index
Expand Down
45 changes: 35 additions & 10 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,9 @@ def data_array_view(
raise ValueError(f"Unsupported mode: {mode}")
else:
obj = None
return cuda.as_cuda_array(obj).view(self.dtype)
return cuda.as_cuda_array(obj).view(
getattr(self.dtype, "numpy_dtype", self.dtype)
)

def mask_array_view(
self, *, mode: Literal["write", "read"] = "write"
Expand Down Expand Up @@ -1227,6 +1229,7 @@ def __setitem__(self, key: Any, value: Any) -> None:
If ``value`` and ``self`` are of different types, ``value`` is coerced
to ``self.dtype``. Assumes ``self`` and ``value`` are index-aligned.
"""
# import pdb;pdb.set_trace()
value_normalized = self._cast_setitem_value(value)
if isinstance(key, slice):
out: ColumnBase | None = self._scatter_by_slice(
Expand Down Expand Up @@ -1633,6 +1636,7 @@ def take(
Skip bounds checking if check_bounds is False.
Set rows to null for all out of bound indices if nullify is `True`.
"""
# import pdb;pdb.set_trace()
# Handle zero size
if indices.size == 0:
return cast(Self, column_empty(row_count=0, dtype=self.dtype))
Expand Down Expand Up @@ -1791,6 +1795,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:

@acquire_spill_lock()
def cast(self, dtype: Dtype) -> ColumnBase:
# import pdb;pdb.set_trace()
result = type(self).from_pylibcudf(
plc.unary.cast(
self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
Expand All @@ -1802,10 +1807,16 @@ def cast(self, dtype: Dtype) -> ColumnBase:
):
result.dtype.precision = dtype.precision # type: ignore[union-attr]
if cudf.get_option("mode.pandas_compatible") and result.dtype != dtype:
if self.dtype.kind == "f" and is_pandas_nullable_extension_dtype(
dtype
):
result = result.set_mask(self.nans_to_nulls().mask)
# result = result.nans_to_nulls()
result._dtype = dtype
return result

def astype(self, dtype: DtypeObj, copy: bool = False) -> ColumnBase:
# import pdb;pdb.set_trace()
if self.dtype == dtype:
result = self
elif len(self) == 0:
Expand Down Expand Up @@ -2246,15 +2257,19 @@ def _return_sentinel_column():
def copy_if_else(
self, other: Self | plc.Scalar, boolean_mask: NumericalColumn
) -> Self:
return type(self).from_pylibcudf( # type: ignore[return-value]
plc.copying.copy_if_else(
self.to_pylibcudf(mode="read"),
other
if isinstance(other, plc.Scalar)
else other.to_pylibcudf(mode="read"),
boolean_mask.to_pylibcudf(mode="read"),
return (
type(self)
.from_pylibcudf( # type: ignore[return-value]
plc.copying.copy_if_else(
self.to_pylibcudf(mode="read"),
other
if isinstance(other, plc.Scalar)
else other.to_pylibcudf(mode="read"),
boolean_mask.to_pylibcudf(mode="read"),
)
)
)
._with_type_metadata(self.dtype)
) # type: ignore[return-value]

def split_by_offsets(
self, offsets: list[int]
Expand Down Expand Up @@ -2948,7 +2963,9 @@ def as_column(
if isinstance(arbitrary, NumpyExtensionArray):
# infer_dtype does not handle NumpyExtensionArray
arbitrary = np.array(arbitrary, dtype=object)
inferred_dtype = infer_dtype(arbitrary)
inferred_dtype = infer_dtype(
arbitrary, skipna=not cudf.get_option("mode.pandas_compatible")
)
if inferred_dtype in ("mixed-integer", "mixed-integer-float"):
raise MixedTypeError("Cannot create column with mixed types")
elif dtype is None and inferred_dtype not in (
Expand Down Expand Up @@ -2984,6 +3001,14 @@ def as_column(
arbitrary,
from_pandas=True,
)
if (
cudf.get_option("mode.pandas_compatible")
and inferred_dtype == "mixed"
and not isinstance(
pyarrow_array.type, (pa.ListType, pa.StructType)
)
):
raise MixedTypeError("Cannot create column with mixed types")
return as_column(
pyarrow_array,
dtype=dtype,
Expand Down
58 changes: 53 additions & 5 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ def __invert__(self):
return super().__invert__()

def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
# import pdb;pdb.set_trace()
int_float_dtype_mapping = {
np.int8: np.float32,
np.int16: np.float32,
Expand All @@ -210,12 +211,34 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
np.uint64: np.float64,
np.bool_: np.float32,
}
if cudf.get_option("mode.pandas_compatible"):
int_float_dtype_mapping = {
np.int8: np.float64,
np.int16: np.float64,
np.int32: np.float64,
np.int64: np.float64,
np.uint8: np.float64,
np.uint16: np.float64,
np.uint32: np.float64,
np.uint64: np.float64,
np.bool_: np.float64,
}

# if self.dtype.kind == "b":
# if op.strip("_").lstrip("r") in ["pow", "truediv", "floordiv"]:
# # match behavior with non-masked bool dtype
# raise NotImplementedError("Power and division not supported for boolean dtype")
# elif op in ["__sub__", "__rsub__"]:
# exception message would include "numpy boolean subtract"
# raise TypeError("Cannot subtract boolean dtype")
# return None
out_dtype = None
if op in {"__truediv__", "__rtruediv__"}:
# Division with integer types results in a suitable float.
if truediv_type := int_float_dtype_mapping.get(self.dtype.type):
return self.astype(np.dtype(truediv_type))._binaryop(other, op)
return self.astype(
get_dtype_of_same_kind(self.dtype, np.dtype(truediv_type))
)._binaryop(other, op)
elif op in {
"__lt__",
"__gt__",
Expand Down Expand Up @@ -258,6 +281,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
)

if out_dtype is None:
# import pdb;pdb.set_trace()
out_dtype = find_common_type((self.dtype, other_cudf_dtype))
if op in {"__mod__", "__floordiv__"}:
tmp = self if reflect else other
Expand All @@ -270,6 +294,10 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
out_dtype = get_dtype_of_same_kind(
out_dtype, np.dtype(np.float64)
)
# elif tmp_dtype.kind == "b":
# out_dtype = get_dtype_of_same_kind(
# out_dtype, np.dtype(np.int8)
# )

if op in {"__and__", "__or__", "__xor__"}:
if self.dtype.kind == "f" or other_cudf_dtype.kind == "f":
Expand All @@ -296,7 +324,19 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
lhs = pa_scalar_to_plc_scalar(lhs)
elif isinstance(rhs, pa.Scalar):
rhs = pa_scalar_to_plc_scalar(rhs)
return binaryop.binaryop(lhs, rhs, op, out_dtype)
res = binaryop.binaryop(lhs, rhs, op, out_dtype)
if op in {"__mod__", "__floordiv__"} and tmp_dtype.kind == "b":
res = res.astype(
get_dtype_of_same_kind(out_dtype, np.dtype(np.int8))
)
elif (
op == "INT_POW"
and res.null_count
and not isinstance(rhs, plc.Scalar)
):
res = res.copy_if_else(lhs, res._get_mask_as_column())
pass
return res

def nans_to_nulls(self: Self) -> Self:
# Only floats can contain nan.
Expand Down Expand Up @@ -338,7 +378,10 @@ def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase:
# => np.int64
# np.promote_types(np.asarray([0], dtype=np.int64).dtype, np.uint8)
# => np.int64
common_dtype = np.result_type(self.dtype, other) # noqa: TID251

common_dtype = np.result_type(
getattr(self.dtype, "numpy_dtype", self.dtype), other
)
if common_dtype.kind in {"b", "i", "u", "f"}:
if self.dtype.kind == "b" and not isinstance(other, bool):
common_dtype = min_signed_type(other)
Expand Down Expand Up @@ -405,16 +448,21 @@ def as_decimal_column(self, dtype: DecimalDtype) -> DecimalBaseColumn:
return self.cast(dtype=dtype) # type: ignore[return-value]

def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
# import pdb;pdb.set_trace()
if dtype == self.dtype:
return self
if cudf.get_option("mode.pandas_compatible"):
if dtype_to_pylibcudf_type(dtype) == dtype_to_pylibcudf_type(
self.dtype
):
if self.dtype.kind == "f":
res = self.nans_to_nulls()
else:
res = self
# Short-circuit the cast if the dtypes are equivalent
# but not the same type object.
self._dtype = dtype
return self
res._dtype = dtype
return res
return self.cast(dtype=dtype) # type: ignore[return-value]

def all(self, skipna: bool = True) -> bool:
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5748,6 +5748,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
1 1 2
2 3 4
"""
# import pdb;pdb.set_trace()
if nan_as_null is no_default:
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
Expand Down Expand Up @@ -8708,6 +8709,7 @@ def from_pandas(obj, nan_as_null=no_default):
>>> type(pmidx)
<class 'pandas.core.indexes.multi.MultiIndex'>
"""
# import pdb;pdb.set_trace()
if nan_as_null is no_default:
nan_as_null = (
False if cudf.get_option("mode.pandas_compatible") else None
Expand Down
8 changes: 7 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
find_common_type,
is_dtype_obj_numeric,
is_mixed_with_object_dtype,
is_pandas_nullable_extension_dtype,
)
from cudf.utils.performance_tracking import _performance_tracking
from cudf.utils.utils import _EQUALITY_OPS, _is_same_name
Expand Down Expand Up @@ -205,7 +206,9 @@ def __setitem__(self, key, value):
# In contrast to Column.__setitem__ (which downcasts the value to
# the dtype of the column) here we upcast the series to the
# larger data type mimicking pandas
if not (value is None or value is cudf.NA or value is np.nan):
if not (value is None or value is cudf.NA or value is np.nan) and (
not is_pandas_nullable_extension_dtype(self._frame.dtype)
):
tmp_value = as_column(value)
if tmp_value.dtype.kind in "uifb" and not (
self._frame.dtype.kind == "b"
Expand Down Expand Up @@ -1268,6 +1271,7 @@ def __getitem__(self, arg):

@_performance_tracking
def __setitem__(self, key, value):
# import pdb;pdb.set_trace()
if isinstance(key, slice):
self.iloc[key] = value
else:
Expand Down Expand Up @@ -2926,6 +2930,8 @@ def unique(self):
"""
res = self._column.unique()
if cudf.get_option("mode.pandas_compatible"):
if is_pandas_nullable_extension_dtype(self.dtype):
raise NotImplementedError("cudf does not support arrays")
return res.values
return Series._from_column(res, name=self.name)

Expand Down
Loading