Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions cpp/benchmarks/ndsh/q09.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,9 +240,8 @@ std::unique_ptr<table_with_names> join_data(q9_data const& data)

// Generating the `profit` table
// Filter the part table using `p_name like '%green%'`
auto const p_name = data.part->table().column(1);
auto const mask =
cudf::strings::like(cudf::strings_column_view(p_name), cudf::string_scalar("%green%"));
auto const p_name = data.part->table().column(1);
auto const mask = cudf::strings::like(cudf::strings_column_view(p_name), "%green%");
auto const part_filtered = apply_mask(data.part, mask);

// Perform the joins
Expand Down
29 changes: 28 additions & 1 deletion cpp/include/cudf/strings/contains.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,9 @@ std::unique_ptr<column> count_re(
*
* Any null string entries return corresponding null output column entries.
*
* @throw std::invalid_argument if `pattern` or `escape_character` is invalid
* The `pattern` and `escape_character` parameters must remain valid until a
* synchronization is performed on the given `stream` parameter.
*
* @throw std::invalid_argument if `escape_character` contains more than one byte
*
* @param input Strings instance for this operation
Expand All @@ -148,6 +150,31 @@ std::unique_ptr<column> count_re(
* @return New boolean column
*/
std::unique_ptr<column> like(
strings_column_view const& input,
std::string_view const& pattern,
std::string_view const& escape_character = "",
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns a boolean column identifying rows which
* match the given like pattern
*
* @deprecated in 25.12 and to be removed in a future release. Use like(strings_column_view,
* std::string_view, std::string_view, rmm::cuda_stream_view, rmm::device_async_resource_ref)
*
* @throw std::invalid_argument if `pattern` or `escape_character` is invalid
* @throw std::invalid_argument if `escape_character` contains more than one byte
*
* @param input Strings instance for this operation
* @param pattern Like pattern to match within each string
* @param escape_character Optional character that specifies the escape prefix.
* Default is no escape character.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New boolean column
*/
[[deprecated]] std::unique_ptr<column> like(
strings_column_view const& input,
string_scalar const& pattern,
string_scalar const& escape_character = string_scalar(""),
Expand Down
21 changes: 21 additions & 0 deletions cpp/src/strings/like.cu
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,17 @@ std::unique_ptr<column> like(strings_column_view const& input,
return like(input, patterns_itr, d_escape, stream, mr);
}

std::unique_ptr<column> like(strings_column_view const& input,
std::string_view const& pattern,
std::string_view const& escape_character,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto const ptn = string_scalar(pattern, true, stream);
auto const esc = string_scalar(escape_character, true, stream);
Comment on lines +375 to +376
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stream-order/lifetime question. How do we know the string_view lifetime is long enough to do the stream-ordered copy to the _data buffer without inserting a sync or copying to an internal host buffer first? What guarantees safety here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point. Technically, the output is not guaranteed to be ready unless the caller syncs, so I could either add a sync or just add a comment to the doxygen indicating that the parameter must stay alive until the caller syncs on the input stream.

return like(input, ptn, esc, stream, mr);
}

std::unique_ptr<column> like(strings_column_view const& input,
strings_column_view const& patterns,
string_scalar const& escape_character,
Expand Down Expand Up @@ -406,6 +417,16 @@ std::unique_ptr<column> like(strings_column_view const& input,
return detail::like(input, pattern, escape_character, stream, mr);
}

// Overload of like() that takes the pattern and the escape character as
// std::string_view values. It contains no matching logic of its own: every
// argument is forwarded unchanged to detail::like(), and that call's result
// is returned to the caller.
std::unique_ptr<column> like(strings_column_view const& input,
std::string_view const& pattern,
std::string_view const& escape_character,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
// NOTE(review): presumably marks a profiling range covering this call --
// confirm against the macro's definition.
CUDF_FUNC_RANGE();
return detail::like(input, pattern, escape_character, stream, mr);
}

std::unique_ptr<column> like(strings_column_view const& input,
strings_column_view const& patterns,
string_scalar const& escape_character,
Expand Down
9 changes: 5 additions & 4 deletions cpp/tests/streams/strings/contains_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand Down Expand Up @@ -31,11 +31,12 @@ TEST_F(StringsContainsTest, Like)
auto input = cudf::test::strings_column_wrapper({"Héllo", "thesés", "tést", ""});
auto view = cudf::strings_column_view(input);

auto const pattern = cudf::string_scalar("%és", true, cudf::test::get_default_stream());
auto const escape = cudf::string_scalar("%", true, cudf::test::get_default_stream());
auto const pattern = std::string_view("%és");
auto const escape = std::string_view("%");
cudf::strings::like(view, pattern, escape, cudf::test::get_default_stream());

auto const s_escape = cudf::string_scalar(escape, true, cudf::test::get_default_stream());
auto const patterns = cudf::test::strings_column_wrapper({"H%", "t%s", "t", ""});
cudf::strings::like(
view, cudf::strings_column_view(patterns), escape, cudf::test::get_default_stream());
view, cudf::strings_column_view(patterns), s_escape, cudf::test::get_default_stream());
}
8 changes: 2 additions & 6 deletions cpp/tests/strings/like_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,18 +346,14 @@ TEST_F(StringsLikeTests, AllNulls)

TEST_F(StringsLikeTests, Errors)
{
auto const input = cudf::test::strings_column_wrapper({"3", "33"});
auto const sv = cudf::strings_column_view(input);
auto const invalid_str = cudf::string_scalar("", false);
auto const input = cudf::test::strings_column_wrapper({"3", "33"});
auto const sv = cudf::strings_column_view(input);

EXPECT_THROW(cudf::strings::like(sv, invalid_str), std::invalid_argument);
EXPECT_THROW(cudf::strings::like(sv, std::string_view("3"), invalid_str), std::invalid_argument);
EXPECT_THROW(cudf::strings::like(sv, std::string_view("3"), std::string_view("ee")),
std::invalid_argument);

auto patterns = cudf::test::strings_column_wrapper({"3", ""}, {true, false});
auto const sv_patterns = cudf::strings_column_view(patterns);
EXPECT_THROW(cudf::strings::like(sv, sv_patterns), std::invalid_argument);
EXPECT_THROW(cudf::strings::like(sv, sv, invalid_str), std::invalid_argument);
EXPECT_THROW(cudf::strings::like(sv, sv_patterns, std::string_view("ee")), std::invalid_argument);
}
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,8 +1347,8 @@ def str_contains(self, pattern: str | Self) -> Self:
def like(self, pattern: str, escape: str) -> Self:
plc_column = plc.strings.contains.like(
self.to_pylibcudf(mode="read"),
pa_scalar_to_plc_scalar(pa.scalar(pattern)),
pa_scalar_to_plc_scalar(pa.scalar(escape)),
pattern,
escape,
)
return (
type(self)
Expand Down
5 changes: 3 additions & 2 deletions python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from pylibcudf.exception_handler cimport libcudf_exception_handler
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
Expand Down Expand Up @@ -32,8 +33,8 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:

cdef unique_ptr[column] like(
column_view source_strings,
string_scalar pattern,
string_scalar escape_character,
string pattern,
string escape_character,
cuda_stream_view stream,
device_memory_resource* mr) except +libcudf_exception_handler

Expand Down
8 changes: 2 additions & 6 deletions python/pylibcudf/pylibcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,10 @@
# SPDX-License-Identifier: Apache-2.0

from pylibcudf.column cimport Column
from pylibcudf.scalar cimport Scalar
from pylibcudf.strings.regex_program cimport RegexProgram
from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource
from rmm.pylibrmm.stream cimport Stream

ctypedef fused ColumnOrScalar:
Column
Scalar

cpdef Column contains_re(
Column input, RegexProgram prog, Stream stream=*, DeviceMemoryResource mr=*
Expand All @@ -25,8 +21,8 @@ cpdef Column matches_re(

cpdef Column like(
Column input,
ColumnOrScalar pattern,
Scalar escape_character=*,
str pattern,
str escape_character=*,
Stream stream=*,
DeviceMemoryResource mr=*,
)
5 changes: 2 additions & 3 deletions python/pylibcudf/pylibcudf/strings/contains.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ from rmm.pylibrmm.memory_resource import DeviceMemoryResource
from rmm.pylibrmm.stream import Stream

from pylibcudf.column import Column
from pylibcudf.scalar import Scalar
from pylibcudf.strings.regex_program import RegexProgram

def contains_re(
Expand All @@ -28,8 +27,8 @@ def matches_re(
) -> Column: ...
def like(
input: Column,
pattern: Column | Scalar,
escape_character: Scalar | None = None,
pattern: str,
escape_character: str | None = None,
stream: Stream | None = None,
mr: DeviceMemoryResource | None = None,
) -> Column: ...
58 changes: 18 additions & 40 deletions python/pylibcudf/pylibcudf/strings/contains.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,12 @@
# SPDX-License-Identifier: Apache-2.0
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from cython.operator import dereference
from libcpp.string cimport string

from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from pylibcudf.libcudf.strings cimport contains as cpp_contains
from pylibcudf.strings.regex_program cimport RegexProgram
from pylibcudf.scalar cimport Scalar
from pylibcudf.utils cimport _get_stream, _get_memory_resource
from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource
from rmm.pylibrmm.stream cimport Stream
Expand Down Expand Up @@ -139,8 +134,8 @@ cpdef Column matches_re(

cpdef Column like(
Column input,
ColumnOrScalar pattern,
Scalar escape_character=None,
str pattern,
str escape_character=None,
Stream stream=None,
DeviceMemoryResource mr=None,
):
Expand All @@ -154,9 +149,9 @@ cpdef Column like(
----------
input : Column
The input strings
pattern : Column or Scalar
Like patterns to match within each string
escape_character : Scalar
pattern : str
Like pattern to match within each string
escape_character : str
Optional character that specifies the escape prefix.
Default is no escape character.

Expand All @@ -170,35 +165,18 @@ cpdef Column like(
mr = _get_memory_resource(mr)

if escape_character is None:
escape_character = Scalar.from_libcudf(
cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr())
)
escape_character = ""

cdef string c_escape_character = escape_character.encode()
cdef string c_pattern = pattern.encode()

cdef const string_scalar* c_escape_character = <const string_scalar*>(
escape_character.c_obj.get()
)
cdef const string_scalar* c_pattern

if ColumnOrScalar is Column:
with nogil:
result = cpp_contains.like(
input.view(),
pattern.view(),
dereference(c_escape_character),
stream.view(),
mr.get_mr()
)
elif ColumnOrScalar is Scalar:
c_pattern = <const string_scalar*>(pattern.c_obj.get())
with nogil:
result = cpp_contains.like(
input.view(),
dereference(c_pattern),
dereference(c_escape_character),
stream.view(),
mr.get_mr()
)
else:
raise ValueError("pattern must be a Column or a Scalar")
with nogil:
result = cpp_contains.like(
input.view(),
c_pattern,
c_escape_character,
stream.view(),
mr.get_mr()
)

return Column.from_libcudf(move(result), stream, mr)
2 changes: 1 addition & 1 deletion python/pylibcudf/tests/test_string_contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_like():
arr = pa.array(["1a2aa3aaa"])
got = plc.strings.contains.like(
plc.Column.from_arrow(arr),
plc.Column.from_arrow(pa.array([pattern])),
pattern,
)
expect = pc.match_like(arr, pattern)
assert_column_eq(expect, got)
Loading