diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 2571a4fd589..8de6e18fa1a 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -240,9 +240,8 @@ std::unique_ptr join_data(q9_data const& data) // Generating the `profit` table // Filter the part table using `p_name like '%green%'` - auto const p_name = data.part->table().column(1); - auto const mask = - cudf::strings::like(cudf::strings_column_view(p_name), cudf::string_scalar("%green%")); + auto const p_name = data.part->table().column(1); + auto const mask = cudf::strings::like(cudf::strings_column_view(p_name), "%green%"); auto const part_filtered = apply_mask(data.part, mask); // Perform the joins diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index 43a8b5cf8aa..a386cd9de0a 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -136,7 +136,9 @@ std::unique_ptr count_re( * * Any null string entries return corresponding null output column entries. * - * @throw std::invalid_argument if `pattern` or `escape_character` is invalid + * The `pattern` and `escape_character` parameters must remain valid until a + * synchronize is performed on the given `stream` parameter. + * * @throw std::invalid_argument if `escape_character` contains more than on byte * * @param input Strings instance for this operation @@ -148,6 +150,31 @@ std::unique_ptr count_re( * @return New boolean column */ std::unique_ptr like( + strings_column_view const& input, + std::string_view const& pattern, + std::string_view const& escape_character = "", + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns a boolean column identifying rows which + * match the given like pattern + * + * @deprecated in 25.12 and to be removed in a future release. 
Use like(strings_column_view, + * std::string_view, std::string_view, rmm::cuda_stream_view, rmm::device_async_resource_ref) instead. + * + * @throw std::invalid_argument if `pattern` or `escape_character` is invalid + * @throw std::invalid_argument if `escape_character` contains more than one byte + * + * @param input Strings instance for this operation + * @param pattern Like pattern to match within each string + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New boolean column + */ +[[deprecated]] std::unique_ptr like( strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character = string_scalar(""), diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index 4c07a51bd57..a363536739a 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -366,6 +366,17 @@ std::unique_ptr like(strings_column_view const& input, return like(input, patterns_itr, d_escape, stream, mr); } +std::unique_ptr like(strings_column_view const& input, + std::string_view const& pattern, + std::string_view const& escape_character, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const ptn = string_scalar(pattern, true, stream); + auto const esc = string_scalar(escape_character, true, stream); + return like(input, ptn, esc, stream, mr); /* forwards to the string_scalar overload */ +} + std::unique_ptr like(strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character, @@ -406,6 +417,16 @@ std::unique_ptr like(strings_column_view const& input, return detail::like(input, pattern, escape_character, stream, mr); } +std::unique_ptr like(strings_column_view const& input, + std::string_view const& pattern, + std::string_view const& escape_character, + rmm::cuda_stream_view 
stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::like(input, pattern, escape_character, stream, mr); +} + std::unique_ptr like(strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character, diff --git a/cpp/tests/streams/strings/contains_test.cpp b/cpp/tests/streams/strings/contains_test.cpp index 56defc97aa4..177f13cdce5 100644 --- a/cpp/tests/streams/strings/contains_test.cpp +++ b/cpp/tests/streams/strings/contains_test.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -31,11 +31,12 @@ TEST_F(StringsContainsTest, Like) auto input = cudf::test::strings_column_wrapper({"Héllo", "thesés", "tést", ""}); auto view = cudf::strings_column_view(input); - auto const pattern = cudf::string_scalar("%és", true, cudf::test::get_default_stream()); - auto const escape = cudf::string_scalar("%", true, cudf::test::get_default_stream()); + auto const pattern = std::string_view("%és"); + auto const escape = std::string_view("%"); cudf::strings::like(view, pattern, escape, cudf::test::get_default_stream()); + auto const s_escape = cudf::string_scalar(escape, true, cudf::test::get_default_stream()); auto const patterns = cudf::test::strings_column_wrapper({"H%", "t%s", "t", ""}); cudf::strings::like( - view, cudf::strings_column_view(patterns), escape, cudf::test::get_default_stream()); + view, cudf::strings_column_view(patterns), s_escape, cudf::test::get_default_stream()); } diff --git a/cpp/tests/strings/like_tests.cpp b/cpp/tests/strings/like_tests.cpp index 73590834f25..6d13dc82aea 100644 --- a/cpp/tests/strings/like_tests.cpp +++ b/cpp/tests/strings/like_tests.cpp @@ -346,18 +346,14 @@ TEST_F(StringsLikeTests, AllNulls) TEST_F(StringsLikeTests, Errors) { - auto const input = cudf::test::strings_column_wrapper({"3", "33"}); - auto const sv 
= cudf::strings_column_view(input); - auto const invalid_str = cudf::string_scalar("", false); + auto const input = cudf::test::strings_column_wrapper({"3", "33"}); + auto const sv = cudf::strings_column_view(input); - EXPECT_THROW(cudf::strings::like(sv, invalid_str), std::invalid_argument); - EXPECT_THROW(cudf::strings::like(sv, std::string_view("3"), invalid_str), std::invalid_argument); EXPECT_THROW(cudf::strings::like(sv, std::string_view("3"), std::string_view("ee")), std::invalid_argument); auto patterns = cudf::test::strings_column_wrapper({"3", ""}, {true, false}); auto const sv_patterns = cudf::strings_column_view(patterns); EXPECT_THROW(cudf::strings::like(sv, sv_patterns), std::invalid_argument); - EXPECT_THROW(cudf::strings::like(sv, sv, invalid_str), std::invalid_argument); EXPECT_THROW(cudf::strings::like(sv, sv_patterns, std::string_view("ee")), std::invalid_argument); } diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c86d9ce8054..fdbe5e1662a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1347,8 +1347,8 @@ def str_contains(self, pattern: str | Self) -> Self: def like(self, pattern: str, escape: str) -> Self: plc_column = plc.strings.contains.like( self.to_pylibcudf(mode="read"), - pa_scalar_to_plc_scalar(pa.scalar(pattern)), - pa_scalar_to_plc_scalar(pa.scalar(escape)), + pattern, + escape, ) return ( type(self) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd index 133c1459845..0618d3f0ff2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -32,8 +33,8 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] like( column_view source_strings, - string_scalar pattern, - string_scalar escape_character, + string pattern, + string escape_character, cuda_stream_view stream, device_memory_resource* mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd index 07775b3f28e..b3b0f06efb5 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/strings/contains.pxd @@ -2,14 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column -from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_program cimport RegexProgram from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream -ctypedef fused ColumnOrScalar: - Column - Scalar cpdef Column contains_re( Column input, RegexProgram prog, Stream stream=*, DeviceMemoryResource mr=* @@ -25,8 +21,8 @@ cpdef Column matches_re( cpdef Column like( Column input, - ColumnOrScalar pattern, - Scalar escape_character=*, + str pattern, + str escape_character=*, Stream stream=*, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyi b/python/pylibcudf/pylibcudf/strings/contains.pyi index 417214e5adc..3685cf5345a 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyi +++ b/python/pylibcudf/pylibcudf/strings/contains.pyi @@ -5,7 +5,6 @@ from rmm.pylibrmm.memory_resource import DeviceMemoryResource from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column -from pylibcudf.scalar import 
Scalar from pylibcudf.strings.regex_program import RegexProgram def contains_re( @@ -28,8 +27,8 @@ def matches_re( ) -> Column: ... def like( input: Column, - pattern: Column | Scalar, - escape_character: Scalar | None = None, + pattern: str, + escape_character: str | None = None, stream: Stream | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 246d6b4550c..954fee3880d 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -2,17 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cython.operator import dereference +from libcpp.string cimport string from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.scalar.scalar_factories cimport ( - make_string_scalar as cpp_make_string_scalar, -) from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram -from pylibcudf.scalar cimport Scalar from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream @@ -139,8 +134,8 @@ cpdef Column matches_re( cpdef Column like( Column input, - ColumnOrScalar pattern, - Scalar escape_character=None, + str pattern, + str escape_character=None, Stream stream=None, DeviceMemoryResource mr=None, ): @@ -154,9 +149,9 @@ cpdef Column like( ---------- input : Column The input strings - pattern : Column or Scalar - Like patterns to match within each string - escape_character : Scalar + pattern : str + Like pattern to match within each string + escape_character : str, optional Optional character specifies the escape prefix. Default is no escape character. 
@@ -170,35 +165,18 @@ cpdef Column like( mr = _get_memory_resource(mr) if escape_character is None: - escape_character = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) - ) + escape_character = "" + + cdef string c_escape_character = escape_character.encode() + cdef string c_pattern = pattern.encode() - cdef const string_scalar* c_escape_character = ( - escape_character.c_obj.get() - ) - cdef const string_scalar* c_pattern - - if ColumnOrScalar is Column: - with nogil: - result = cpp_contains.like( - input.view(), - pattern.view(), - dereference(c_escape_character), - stream.view(), - mr.get_mr() - ) - elif ColumnOrScalar is Scalar: - c_pattern = (pattern.c_obj.get()) - with nogil: - result = cpp_contains.like( - input.view(), - dereference(c_pattern), - dereference(c_escape_character), - stream.view(), - mr.get_mr() - ) - else: - raise ValueError("pattern must be a Column or a Scalar") + with nogil: + result = cpp_contains.like( + input.view(), + c_pattern, + c_escape_character, + stream.view(), + mr.get_mr() + ) return Column.from_libcudf(move(result), stream, mr) diff --git a/python/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/tests/test_string_contains.py index 9de028e1209..9807e78bf68 100644 --- a/python/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/tests/test_string_contains.py @@ -81,7 +81,7 @@ def test_like(): arr = pa.array(["1a2aa3aaa"]) got = plc.strings.contains.like( plc.Column.from_arrow(arr), - plc.Column.from_arrow(pa.array([pattern])), + pattern, ) expect = pc.match_like(arr, pattern) assert_column_eq(expect, got)