Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

<regex>: Fix depth-first and leftmost-longest matching rules #5218

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 31 additions & 8 deletions stl/inc/regex
Original file line number Diff line number Diff line change
@@ -1640,11 +1640,12 @@ public:

if (_Matches) { // copy results to _Matches
_Matches->_Resize(_Get_ncap());
const auto& _Result = _Longest ? _Res : _Tgt_state;
for (unsigned int _Idx = 0; _Idx < _Get_ncap(); ++_Idx) { // copy submatch _Idx
if (_Res._Grp_valid[_Idx]) { // copy successful match
if (_Result._Grp_valid[_Idx]) { // copy successful match
_Matches->_At(_Idx).matched = true;
_Matches->_At(_Idx).first = _Res._Grps[_Idx]._Begin;
_Matches->_At(_Idx).second = _Res._Grps[_Idx]._End;
_Matches->_At(_Idx).first = _Result._Grps[_Idx]._Begin;
_Matches->_At(_Idx).second = _Result._Grps[_Idx]._End;
} else { // copy failed match
_Matches->_At(_Idx).matched = false;
_Matches->_At(_Idx).first = _End;
@@ -3290,6 +3291,20 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_rep(_Node_rep* _Node, bool _Gr
_Psav->_Loop_iter = _STD addressof(_Cur_iter);
_Matched0 = _Match_pat(_Node->_Next);
}
} else if (_Longest) { // longest, try any number of repetitions

// match with no further repetition
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
// match with at least one more repetition if last repetition made progress
if (_Progress) {
_Tgt_state = _St;
_Psav->_Loop_idx = _Init_idx + 1;
_Psav->_Loop_iter = _STD addressof(_Cur_iter);

if (_Match_pat(_Node->_Next)) { // always call _Match_pat, even when _Matched0 is already true
_Matched0 = true;
}
}
} else if (!_Greedy) { // not greedy, favor minimum number of reps
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
if (!_Matched0 && _Progress) { // tail failed, try another rep
@@ -3450,16 +3465,24 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
}

template <class _BidIt, class _Elem, class _RxTraits, class _It>
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under UNIX rules
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under leftmost-longest rule
for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group
if (_Res._Grp_valid[_Ix] && _Tgt_state._Grp_valid[_Ix]) {
// any match (even an empty one) is better than no match at all
if (_Res._Grp_valid[_Ix] != _Tgt_state._Grp_valid[_Ix]) {
return _Tgt_state._Grp_valid[_Ix];
}

if (_Res._Grp_valid[_Ix]) { // now known to be equal to _Tgt_state._Grp_valid[_Ix], no need to test both
// if both groups are matched, prefer the leftmost one
if (_Res._Grps[_Ix]._Begin != _Tgt_state._Grps[_Ix]._Begin) {
return _STD distance(_Begin, _Res._Grps[_Ix]._Begin)
< _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin);
> _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin);
}

// if both groups start at the same position, prefer the longer one
if (_Res._Grps[_Ix]._End != _Tgt_state._Grps[_Ix]._End) {
return _STD distance(_Begin, _Res._Grps[_Ix]._End) < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._End);
return _STD distance(_Res._Grps[_Ix]._Begin, _Res._Grps[_Ix]._End)
< _STD distance(_Tgt_state._Grps[_Ix]._Begin, _Tgt_state._Grps[_Ix]._End);
}
}
}
@@ -3678,7 +3701,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Match_pat(_Node_base* _Nx) { // c
&& _Begin == _Tgt_state._Cur)
|| (_Full && _Tgt_state._Cur != _End)) {
_Failed = true;
} else if (!_Matched || _Better_match()) { // record successful match
} else if (_Longest && (!_Matched || _Better_match())) { // record successful match
_Res = _Tgt_state;
_Matched = true;
}
90 changes: 90 additions & 0 deletions tests/std/include/test_regex_support.hpp
Original file line number Diff line number Diff line change
@@ -2,9 +2,12 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#pragma once
#include <cstddef>
#include <cstdio>
#include <initializer_list>
#include <regex>
#include <string>
#include <utility>

class regex_fixture {
int regex_test_result = 0;
@@ -241,6 +244,93 @@ class test_regex {
fixture->fail_regex();
}
}

void should_search_match_capture_groups(const std::string& subject, const std::string& expected,
const std::regex_constants::match_flag_type match_flags,
std::initializer_list<std::pair<std::ptrdiff_t, std::ptrdiff_t>> capture_groups) const {
std::smatch mr;
try {
const bool search_result = std::regex_search(subject, mr, r, match_flags);
if (!search_result || mr[0] != expected) {
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to find "%s", )", subject.c_str(),
pattern.c_str(), static_cast<unsigned int>(syntax), static_cast<unsigned int>(match_flags),
expected.c_str());
if (search_result) {
printf(R"(but it matched "%s")"
"\n",
mr.str().c_str());
} else {
puts("but it failed to match");
}

fixture->fail_regex();
} else if (capture_groups.size() + 1 != mr.size()) {
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to match %zu capture groups in "%s", )",
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
static_cast<unsigned int>(match_flags), capture_groups.size() + 1, expected.c_str());
printf(R"(but it matched %zu groups)"
"\n",
mr.size());
fixture->fail_regex();
} else {
bool submatches_success = true;
for (std::size_t i = 1U; i < mr.size(); ++i) {
const auto& expected_capture = capture_groups.begin()[i - 1];
const auto& actual_capture = mr[i];
if (expected_capture.first == -1) {
if (actual_capture.matched) {
submatches_success = false;
break;
}
} else if (!actual_capture.matched || actual_capture.first != (mr[0].first + expected_capture.first)
|| actual_capture.second != (mr[0].first + expected_capture.second)) {
submatches_success = false;
break;
}
}
if (!submatches_success) {
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to find capture groups {)",
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
static_cast<unsigned int>(match_flags));

bool initial = true;
for (const auto& expected_capture : capture_groups) {
std::string capture = "(unmatched)";
if (expected_capture.first != -1) {
capture.assign(mr[0].first + expected_capture.first, mr[0].first + expected_capture.second);
}
printf(R"(%s"%s" [%td %td])", initial ? "" : ", ", capture.c_str(), expected_capture.first,
expected_capture.second);
initial = false;
}
printf(R"(} in "%s", but found {)", expected.c_str());

initial = true;
for (std::size_t i = 1U; i < mr.size(); ++i) {
const auto& actual_capture = mr[i];
std::string capture = "(unmatched)";
std::ptrdiff_t first = -1;
std::ptrdiff_t last = -1;
if (actual_capture.matched) {
capture = actual_capture.str();
first = actual_capture.first - mr[0].first;
last = actual_capture.second - mr[0].first;
}
printf(R"(%s"%s" [%td %td])", initial ? "" : ", ", capture.c_str(), first, last);
initial = false;
}
printf("}\n");
fixture->fail_regex();
}
}
} catch (const std::regex_error& e) {
printf(R"(Failed to regex_search("%s", regex("%s", 0x%X), 0x%X): regex_error: "%s")"
"\n",
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
static_cast<unsigned int>(match_flags), e.what());
fixture->fail_regex();
}
}
};

class test_wregex {
79 changes: 79 additions & 0 deletions tests/std/tests/VSO_0000000_regex_use/test.cpp
Original file line number Diff line number Diff line change
@@ -558,6 +558,84 @@ void test_construction_from_nullptr_and_zero() {
}
}

void test_gh_731() {
// GH-731 <regex>: Incorrect behavior for capture groups
// GH-996: regex_search behaves incorrectly when the regex contains R"(\[)"

// Several bugs were fixed in ECMAScript (depth-first) and POSIX (leftmost-longest) matching rules.
{
const test_regex ecma_regex(&g_regexTester, R"((A+)\s*(B+)?\s*B*)", ECMAScript);
ecma_regex.should_search_match_capture_groups("AAA BBB", "AAA BBB", match_default, {{0, 3}, {4, 7}});
}
for (syntax_option_type option : {extended, egrep, awk}) {
const test_regex posix_regex(&g_regexTester, R"((A+)[[:space:]]*(B+)?[[:space:]]*B*)", option);
posix_regex.should_search_match_capture_groups("AAA BBB", "AAA BBB", match_default, {{0, 3}, {4, 7}});
}

{
const test_regex ecma_regex(&g_regexTester, ".*(cat|concatenate)", ECMAScript);
ecma_regex.should_search_match_capture_groups("WXconcatenateYZ", "WXconcat", match_default, {{5, 8}});
}
for (syntax_option_type option : {extended, egrep, awk}) {
const test_regex posix_regex(&g_regexTester, ".*(cat|concatenate)", option);
posix_regex.should_search_match_capture_groups("WXconcatenateYZ", "WXconcatenate", match_default, {{2, 13}});
}

{
const test_regex ecma_regex(&g_regexTester, "(aa|aabaac|ba|b|c)*", ECMAScript);
ecma_regex.should_search_match_capture_groups("aabaac", "aaba", match_default, {{2, 4}});
}
for (syntax_option_type option : {extended, egrep, awk}) {
const test_regex posix_regex(&g_regexTester, "(aa|aabaac|ba|b|c)*", option);
posix_regex.should_search_match_capture_groups("aabaac", "aabaac", match_default, {{0, 6}});
}

{
const test_regex ecma_regex(&g_regexTester, ".*(a|bacc|baccc)", ECMAScript);
ecma_regex.should_search_match_capture_groups("ddbacccd", "ddba", match_default, {{3, 4}});
}
{
const test_regex ecma_regex(&g_regexTester, ".*?(a|bacc|baccc)", ECMAScript);
ecma_regex.should_search_match_capture_groups("ddbacccd", "ddbacc", match_default, {{2, 6}});
}
for (syntax_option_type option : {extended, egrep, awk}) {
const test_regex posix_regex(&g_regexTester, ".*(a|bacc|baccc)", option);
posix_regex.should_search_match_capture_groups("ddbacccd", "ddbaccc", match_default, {{2, 7}});
}

{
const test_regex ecma_regex(&g_regexTester, R"(^[[:blank:]]*#([^\n]*\\[[:space:]]+)*[^\n]*)", ECMAScript);
ecma_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
"#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{30, 42}});
}
{
const test_regex awk_regex(&g_regexTester, R"(^[[:blank:]]*#([^\n]*\\[[:space:]]+)*[^\n]*)", awk);
awk_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
"#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{28, 42}});
}
{
const test_regex extended_regex(&g_regexTester, "^[[:blank:]]*#([^\n]*\\\\[[:space:]]+)*[^\n]*", extended);
extended_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
"#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{28, 42}});
}

{
const test_regex ecma_regex(&g_regexTester, "(ab*)*(ce|bbceef)", ECMAScript);
ecma_regex.should_search_match_capture_groups("aababbbceef", "aababbbce", match_default, {{3, 7}, {7, 9}});
}
for (syntax_option_type option : {extended, egrep, awk}) {
const test_regex posix_regex(&g_regexTester, "(ab*)*(ce|bbceef)", option);
posix_regex.should_search_match_capture_groups("aababbbceef", "aababbbceef", match_default, {{3, 5}, {5, 11}});
}

{
// GH-996 test case
const test_regex ecma_regex(&g_regexTester, R"( *((<<)|(\[)|(.+)))", ECMAScript);
ecma_regex.should_search_match_capture_groups(
" [<</Category/Export>>]>>", " [", match_default, {{1, 2}, {-1, -1}, {1, 2}, {-1, -1}});
}
}

void test_gh_993() {
// GH-993 regex::icase is not handled correctly for some input.
{
@@ -775,6 +853,7 @@ int main() {
test_VSO_225160_match_eol_flag();
test_VSO_226914_word_boundaries();
test_construction_from_nullptr_and_zero();
test_gh_731();
test_gh_993();
test_gh_4995();
test_gh_5058();
6 changes: 4 additions & 2 deletions tests/tr1/tests/regex2/test.cpp
Original file line number Diff line number Diff line change
@@ -659,7 +659,7 @@ static const regex_test tests[] = {
{__LINE__, T("a[a-z]\\{2,4\\}"), T("abcdefghi"), "1 0 5", BASIC | GREP},
{__LINE__, T("a[a-z]{2,4}?"), T("abcdefghi"), "1 0 3", ECMA},
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 4 2 4", ECMA},
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 6 5 6", EEA},
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 6 0 6", EEA},
{__LINE__, T("(z)((a+)?(b+)?(c))*"), T("zaacbbbcac"), "6 0 10 0 1 8 10 8 9 -1 -1 9 10", ECMA},
{__LINE__, T("(a*)b\\1+"), T("baaaac"), "2 0 1 0 0", ECMA},
{__LINE__, T("(?=(a+))"), T("baaabac"), "2 1 1 1 4", ECMA},
@@ -774,7 +774,9 @@ static const regex_test tests[] = {
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"), T("#define some_symbol(x) #x"), "2 0 25 -1 -1",
ECMA | AWK},
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"),
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 30 42", ECMA | AWK},
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 30 42", ECMA},
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"),
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 28 42", AWK},
};

static STD string check_matches(