
Commit 870c2ff

Add support for horizontal string concatenation pl.concat_str (#19142)
Needed to get TPC-DS Query 5 running.

Authors:
- Matthew Murray (https://github.com/Matt711)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Matthew Roeschke (https://github.com/mroeschke)

URL: #19142
1 parent 628bfc1 commit 870c2ff
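For context, the query shape this commit enables is polars' horizontal string concatenation executed through cudf_polars. A minimal sketch, assuming a polars installation with GPU engine support (the frame and column names are illustrative, not taken from the PR):

import polars as pl

# Illustrative data: one string column containing a null, one integer column.
ldf = pl.LazyFrame({"a": ["x", None, "z"], "c": [1, 2, 3]})

# Horizontal concatenation; per the diff below, non-string inputs are cast to strings.
q = ldf.select(pl.concat_str(["a", "c"], separator="*", ignore_nulls=False))

# With this change, the GPU engine should be able to execute the ConcatHorizontal
# expression instead of treating it as unsupported.
print(q.collect(engine="gpu"))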

File tree

4 files changed, +115 -68 lines changed


python/cudf_polars/cudf_polars/dsl/expressions/string.py

Lines changed: 31 additions & 3 deletions
@@ -12,20 +12,22 @@
 import pyarrow as pa
 import pyarrow.compute as pc
 
+import polars as pl
 from polars.exceptions import InvalidOperationError
 
 import pylibcudf as plc
 
-from cudf_polars.containers import Column
+from cudf_polars.containers import Column, DataType
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
+from cudf_polars.dsl.utils.reshape import broadcast
 
 if TYPE_CHECKING:
     from typing_extensions import Self
 
     from polars.polars import _expr_nodes as pl_expr
 
-    from cudf_polars.containers import DataFrame, DataType
+    from cudf_polars.containers import DataFrame
 
 __all__ = ["StringFunction"]
 
@@ -110,6 +112,7 @@ def __init__(
 
     def _validate_input(self) -> None:
         if self.name not in (
+            StringFunction.Name.ConcatHorizontal,
             StringFunction.Name.ConcatVertical,
             StringFunction.Name.Contains,
             StringFunction.Name.EndsWith,
@@ -212,7 +215,32 @@ def do_evaluate(
         self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        if self.name is StringFunction.Name.ConcatVertical:
+        if self.name is StringFunction.Name.ConcatHorizontal:
+            columns = [
+                Column(child.evaluate(df, context=context).obj).astype(
+                    DataType(pl.String())
+                )
+                for child in self.children
+            ]
+
+            broadcasted = broadcast(
+                *columns, target_length=max(col.size for col in columns)
+            )
+
+            delimiter, ignore_nulls = self.options
+
+            return Column(
+                plc.strings.combine.concatenate(
+                    plc.Table([col.obj for col in broadcasted]),
+                    plc.Scalar.from_py(delimiter, plc.DataType(plc.TypeId.STRING)),
+                    None
+                    if ignore_nulls
+                    else plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING)),
+                    None,
+                    plc.strings.combine.SeparatorOnNulls.NO,
+                )
+            )
+        elif self.name is StringFunction.Name.ConcatVertical:
             (child,) = self.children
             column = child.evaluate(df, context=context)
             delimiter, ignore_nulls = self.options
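To make the new ConcatHorizontal branch easier to follow in isolation, here is a rough standalone sketch of the same pylibcudf call, using only constructors that already appear in this diff; the input columns are built by repeating scalars purely for illustration, and the positional argument order simply mirrors the call above:

import pylibcudf as plc

STRING = plc.DataType(plc.TypeId.STRING)

# Two length-3 string columns, built from scalars (illustration only).
left = plc.Column.from_scalar(plc.Scalar.from_py("foo", STRING), 3)
right = plc.Column.from_scalar(plc.Scalar.from_py("bar", STRING), 3)

# Mirror the branch: a separator scalar, and a null-replacement scalar only
# when nulls are not ignored.
ignore_nulls = False
narep = None if ignore_nulls else plc.Scalar.from_py(None, STRING)

result = plc.strings.combine.concatenate(
    plc.Table([left, right]),
    plc.Scalar.from_py("*", STRING),
    narep,
    None,
    plc.strings.combine.SeparatorOnNulls.NO,
)
# result is a single string column; with these all-valid inputs every row is "foo*bar".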

python/cudf_polars/cudf_polars/dsl/ir.py

Lines changed: 1 addition & 65 deletions
@@ -35,6 +35,7 @@
 from cudf_polars.dsl.nodebase import Node
 from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter
 from cudf_polars.dsl.tracing import nvtx_annotate_cudf_polars
+from cudf_polars.dsl.utils.reshape import broadcast
 from cudf_polars.dsl.utils.windows import range_window_bounds
 from cudf_polars.utils import dtypes
 from cudf_polars.utils.versions import POLARS_VERSION_LT_128
@@ -80,71 +81,6 @@
 ]
 
 
-def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]:
-    """
-    Broadcast a sequence of columns to a common length.
-
-    Parameters
-    ----------
-    columns
-        Columns to broadcast.
-    target_length
-        Optional length to broadcast to. If not provided, uses the
-        non-unit length of existing columns.
-
-    Returns
-    -------
-    List of broadcasted columns all of the same length.
-
-    Raises
-    ------
-    RuntimeError
-        If broadcasting is not possible.
-
-    Notes
-    -----
-    In evaluation of a set of expressions, polars type-puns length-1
-    columns with scalars. When we insert these into a DataFrame
-    object, we need to ensure they are of equal length. This function
-    takes some columns, some of which may be length-1 and ensures that
-    all length-1 columns are broadcast to the length of the others.
-
-    Broadcasting is only possible if the set of lengths of the input
-    columns is a subset of ``{1, n}`` for some (fixed) ``n``. If
-    ``target_length`` is provided and not all columns are length-1
-    (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``.
-    """
-    if len(columns) == 0:
-        return []
-    lengths: set[int] = {column.size for column in columns}
-    if lengths == {1}:
-        if target_length is None:
-            return list(columns)
-        nrows = target_length
-    else:
-        try:
-            (nrows,) = lengths.difference([1])
-        except ValueError as e:
-            raise RuntimeError("Mismatching column lengths") from e
-    if target_length is not None and nrows != target_length:
-        raise RuntimeError(
-            f"Cannot broadcast columns of length {nrows=} to {target_length=}"
-        )
-    return [
-        column
-        if column.size != 1
-        else Column(
-            plc.Column.from_scalar(column.obj_scalar, nrows),
-            is_sorted=plc.types.Sorted.YES,
-            order=plc.types.Order.ASCENDING,
-            null_order=plc.types.NullOrder.BEFORE,
-            name=column.name,
-            dtype=column.dtype,
-        )
-        for column in columns
-    ]
-
-
 class IR(Node["IR"]):
     """Abstract plan node, representing an unevaluated dataframe."""

python/cudf_polars/cudf_polars/dsl/utils/reshape.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for reshaping Columns."""
+
+from __future__ import annotations
+
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+
+
+def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]:
+    """
+    Broadcast a sequence of columns to a common length.
+
+    Parameters
+    ----------
+    columns
+        Columns to broadcast.
+    target_length
+        Optional length to broadcast to. If not provided, uses the
+        non-unit length of existing columns.
+
+    Returns
+    -------
+    List of broadcasted columns all of the same length.
+
+    Raises
+    ------
+    RuntimeError
+        If broadcasting is not possible.
+
+    Notes
+    -----
+    In evaluation of a set of expressions, polars type-puns length-1
+    columns with scalars. When we insert these into a DataFrame
+    object, we need to ensure they are of equal length. This function
+    takes some columns, some of which may be length-1 and ensures that
+    all length-1 columns are broadcast to the length of the others.
+
+    Broadcasting is only possible if the set of lengths of the input
+    columns is a subset of ``{1, n}`` for some (fixed) ``n``. If
+    ``target_length`` is provided and not all columns are length-1
+    (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``.
+    """
+    if len(columns) == 0:
+        return []
+    lengths: set[int] = {column.size for column in columns}
+    if lengths == {1}:
+        if target_length is None:
+            return list(columns)
+        nrows = target_length
+    else:
+        try:
+            (nrows,) = lengths.difference([1])
+        except ValueError as e:
+            raise RuntimeError("Mismatching column lengths") from e
+    if target_length is not None and nrows != target_length:
+        raise RuntimeError(
+            f"Cannot broadcast columns of length {nrows=} to {target_length=}"
+        )
+    return [
+        column
+        if column.size != 1
+        else Column(
+            plc.Column.from_scalar(column.obj_scalar, nrows),
+            is_sorted=plc.types.Sorted.YES,
+            order=plc.types.Order.ASCENDING,
+            null_order=plc.types.NullOrder.BEFORE,
+            name=column.name,
+            dtype=column.dtype,
+        )
+        for column in columns
+    ]
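A rough usage sketch of the relocated broadcast helper, to show the {1, n} rule in action; the Column construction mirrors the patterns elsewhere in this diff (constructor details may vary by version), and values are purely illustrative:

import pylibcudf as plc

from cudf_polars.containers import Column
from cudf_polars.dsl.utils.reshape import broadcast

STRING = plc.DataType(plc.TypeId.STRING)

# A length-1 column (polars' scalar pun) and a length-4 column.
short = Column(plc.Column.from_scalar(plc.Scalar.from_py("sep", STRING), 1))
full = Column(plc.Column.from_scalar(plc.Scalar.from_py("x", STRING), 4))

out = broadcast(short, full)
assert [c.size for c in out] == [4, 4]  # the length-1 column is repeated to length 4

# Lengths outside {1, n} are rejected:
# broadcast(Column(plc.Column.from_scalar(plc.Scalar.from_py("y", STRING), 2)), full)
# -> RuntimeError: Mismatching column lengths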

python/cudf_polars/tests/expressions/test_stringfunction.py

Lines changed: 9 additions & 0 deletions
@@ -498,3 +498,12 @@ def test_string_tail(ldf, tail):
 def test_string_head(ldf, head):
     q = ldf.select(pl.col("a").str.head(head))
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("ignore_nulls", [True, False])
+@pytest.mark.parametrize("separator", ["*", ""])
+def test_concat_horizontal(ldf, ignore_nulls, separator):
+    q = ldf.select(
+        pl.concat_str(["a", "c"], separator=separator, ignore_nulls=ignore_nulls)
+    )
+    assert_gpu_result_equal(q)
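The two ignore_nulls settings exercised by this test differ in how nulls propagate; a small CPU-side sketch of the polars semantics the GPU path is compared against (the frame is illustrative, since the test's ldf fixture is not shown in this diff):

import polars as pl

ldf = pl.LazyFrame({"a": ["x", None], "c": ["1", "2"]})

# ignore_nulls=False: a null in any input makes that row's result null.
strict = ldf.select(pl.concat_str(["a", "c"], separator="*", ignore_nulls=False))
# expected rows: "x*1", null

# ignore_nulls=True: null inputs are skipped (no separator is emitted for them).
lenient = ldf.select(pl.concat_str(["a", "c"], separator="*", ignore_nulls=True))
# expected rows: "x*1", "2"

print(strict.collect(), lenient.collect())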
