-
Notifications
You must be signed in to change notification settings - Fork 1.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix better_match check used by regex search routine #1547
Changes from all commits
f45ed70
e7bec21
af26bd4
7cf7f37
de9c799
2f37a47
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3398,19 +3398,48 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap | |
} | ||
|
||
template <class _BidIt, class _Elem, class _RxTraits, class _It> | ||
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under UNIX rules | ||
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { | ||
// check for better match under the rules of the current regex implementation | ||
for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group | ||
if (_Res._Grp_valid[_Ix] && _Tgt_state._Grp_valid[_Ix]) { | ||
if (_Res._Grps[_Ix]._Begin != _Tgt_state._Grps[_Ix]._Begin) { | ||
return _STD distance(_Begin, _Res._Grps[_Ix]._Begin) | ||
< _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin); | ||
const auto& _Res_grp = _Res._Grps[_Ix]; | ||
const auto& _Tgt_grp = _Tgt_state._Grps[_Ix]; | ||
if (_Res_grp._Begin == _Res_grp._End) { | ||
if (_Tgt_grp._Begin == _Tgt_grp._End) { | ||
// Both groups are empty or invalid - matches are equivalent. | ||
continue; | ||
} | ||
// Target group is valid and non-empty, and current group is either empty or invalid. | ||
return true; | ||
} | ||
if (_Tgt_grp._Begin == _Tgt_grp._End) { | ||
// Target group is invalid or empty, and current group is valid and non-empty, | ||
// so our current group is the better match. | ||
return false; | ||
} | ||
|
||
if (_Res._Grps[_Ix]._End != _Tgt_state._Grps[_Ix]._End) { | ||
return _STD distance(_Begin, _Res._Grps[_Ix]._End) < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._End); | ||
if (_Res_grp._Begin != _Tgt_grp._Begin) { | ||
ptrdiff_t _Res_dist_from_begin = _STD distance(_Begin, _Res_grp._Begin); | ||
ptrdiff_t _Tgt_dist_from_begin = _STD distance(_Begin, _Tgt_grp._Begin); | ||
if (_Sflags & regex_constants::ECMAScript) { | ||
// If one of the matches starts before the other, in ECMAScript, it is considered better. | ||
// This is also known as "leftmost", in languages written from left to right. | ||
return _Tgt_dist_from_begin < _Res_dist_from_begin; | ||
} else { | ||
// Otherwise, Unix uses "rightmost" rules. | ||
strega-nil-ms marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return _Res_dist_from_begin < _Tgt_dist_from_begin; | ||
} | ||
} | ||
|
||
if (_Res_grp._End != _Tgt_grp._End) { | ||
// If one of the matches is longer than the other, and they start at the same point, | ||
// it is considered better. | ||
ptrdiff_t _Res_size = _STD distance(_Res_grp._Begin, _Res_grp._End); | ||
ptrdiff_t _Tgt_size = _STD distance(_Tgt_grp._Begin, _Tgt_grp._End); | ||
return _Tgt_size > _Res_size; | ||
} | ||
Comment on lines
+3420
to
+3439
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this logic is incorrect (although I don't immediately know what all of the problems are). First, although it is not an authoritative reference, https://en.cppreference.com/w/cpp/regex/syntax_option_type describes POSIX as "leftmost longest". I'm surprised to see "rightmost" mentioned in this implementation. (From what I've read, ECMAScript is not "leftmost longest".) Boost explains the Perl/ECMAScript behavior at https://www.boost.org/doc/libs/1_80_0/libs/regex/doc/html/boost_regex/syntax/perl_syntax.html#boost_regex.syntax.perl_syntax.what_gets_matched and the POSIX behavior at https://www.boost.org/doc/libs/1_80_0/libs/regex/doc/html/boost_regex/syntax/leftmost_longest_rule.html . Second, the cppreference article depicts the output:
However, both
In this example with this PR, the final (As an aside, this PR's logic is structured and commented in a clear, comprehensible way, which is great! My only concern is matching what the grammars specify.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, yeah, I apparently misunderstood something as I was cleaning up the logic; I think the existing logic was worse, but yeah; not sure what the best way forward is. |
||
} | ||
|
||
// Use the current match if target isn't better. | ||
return false; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -116,6 +116,59 @@ class regex_fixture { | |
} | ||
} | ||
|
||
void should_capture_groups(const std::string& subject, const std::string& pattern, | ||
const std::vector<std::string>& groups, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to include |
||
const std::regex_constants::syntax_option_type flags = std::regex_constants::ECMAScript) { | ||
try { | ||
const std::regex r(pattern, flags); | ||
std::smatch m; | ||
|
||
if (!std::regex_match(subject, m, r)) { | ||
printf(R"(Expected regex("%s") to match "%s".)" | ||
"\n", | ||
pattern.c_str(), subject.c_str()); | ||
fail_regex(); | ||
return; | ||
} | ||
|
||
if (m[0] != subject) { | ||
printf(R"(should_capture("%s", "%s"): not a full match)" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Occurs below) These messages are saying |
||
"\nexpected: \"%s\"" | ||
"\nfound: \"%s\"\n", | ||
subject.c_str(), pattern.c_str(), subject.c_str(), m[0].str().c_str()); | ||
fail_regex(); | ||
} | ||
|
||
if (m.size() != groups.size() + 1) { | ||
printf(R"(should_capture("%s", "%s"): captures: %zd != %zd)" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
"\n", | ||
subject.c_str(), pattern.c_str(), m.size() - 1, groups.size()); | ||
fail_regex(); | ||
} | ||
|
||
for (size_t i = 0; i < groups.size(); ++i) { | ||
if (m[i + 1] != groups[i]) { | ||
printf(R"(should_capture("%s", "%s"): index %zd)" | ||
"\nexpected: \"%s\"" | ||
"\nfound: \"%s\"\n", | ||
subject.c_str(), pattern.c_str(), i, groups[i].c_str(), m[i + 1].str().c_str()); | ||
fail_regex(); | ||
} | ||
} | ||
} catch (const std::regex_error& e) { | ||
std::string groups_string = "|"; | ||
for (const auto& el : groups) { | ||
groups_string.append(el); | ||
groups_string.push_back('|'); | ||
} | ||
Comment on lines
+159
to
+163
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion: This seems confusing, because it doesn't look like the callsites. Would formatting the groups as |
||
|
||
printf(R"(should_capture("%s", "%s", "%s"): regex_error: "%s")" | ||
"\n", | ||
subject.c_str(), pattern.c_str(), groups_string.c_str(), e.what()); | ||
fail_regex(); | ||
} | ||
} | ||
|
||
void should_replace_to(const std::string& subject, const std::string& pattern, const std::string& fmt, | ||
const std::regex_constants::match_flag_type match_flags, const std::string& expected) { | ||
try { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -546,6 +546,17 @@ void test_VSO_226914_word_boundaries() { | |
aWordAny.should_search_fail("aa", match_not_bow | match_not_eow); | ||
} | ||
|
||
void test_gh_731() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be followed by a comment mentioning the bug's title (like |
||
std::vector<std::string> groups{"AAA", "BBB", ""}; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test is already There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should probably include |
||
g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*(B*)?)", groups); | ||
|
||
groups.assign({"AAA", "BBB"}); | ||
g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*)", groups); | ||
|
||
groups.assign({"c"}); | ||
g_regexTester.should_capture_groups("aabaac", "(aa|aabaac|ba|b|c)*", groups, std::regex_constants::extended); | ||
Comment on lines
+550
to
+557
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need additional test coverage to exercise |
||
} | ||
|
||
void test_gh_993() { | ||
// GH-993 regex::icase is not handled correctly for some input. | ||
{ | ||
|
@@ -607,6 +618,7 @@ int main() { | |
test_VSO_225160_match_bol_flag(); | ||
test_VSO_225160_match_eol_flag(); | ||
test_VSO_226914_word_boundaries(); | ||
test_gh_731(); | ||
test_gh_993(); | ||
|
||
return g_regexTester.result(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We conventionally add newlines between "non-chained"
if
-statements.