diff --git a/stl/inc/regex b/stl/inc/regex index 8747ea9ed7..4b9faec9da 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3398,19 +3398,48 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap } template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under UNIX rules +bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { + // check for better match under the rules of the current regex implementation for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group - if (_Res._Grp_valid[_Ix] && _Tgt_state._Grp_valid[_Ix]) { - if (_Res._Grps[_Ix]._Begin != _Tgt_state._Grps[_Ix]._Begin) { - return _STD distance(_Begin, _Res._Grps[_Ix]._Begin) - < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin); + const auto& _Res_grp = _Res._Grps[_Ix]; + const auto& _Tgt_grp = _Tgt_state._Grps[_Ix]; + if (_Res_grp._Begin == _Res_grp._End) { + if (_Tgt_grp._Begin == _Tgt_grp._End) { + // Both groups are empty or invalid - matches are equivalent. + continue; } + // Target group is valid and non-empty, and current group is either empty or invalid. + return true; + } + if (_Tgt_grp._Begin == _Tgt_grp._End) { + // Target group is invalid or empty, and current group is valid and non-empty, + // so our current group is the better match. + return false; + } - if (_Res._Grps[_Ix]._End != _Tgt_state._Grps[_Ix]._End) { - return _STD distance(_Begin, _Res._Grps[_Ix]._End) < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._End); + if (_Res_grp._Begin != _Tgt_grp._Begin) { + ptrdiff_t _Res_dist_from_begin = _STD distance(_Begin, _Res_grp._Begin); + ptrdiff_t _Tgt_dist_from_begin = _STD distance(_Begin, _Tgt_grp._Begin); + if (_Sflags & regex_constants::ECMAScript) { + // If one of the matches starts before the other, in ECMAScript, it is considered better. + // This is also known as "leftmost", in languages written from left to right. + return _Tgt_dist_from_begin < _Res_dist_from_begin; + } else { + // Otherwise, Unix uses "rightmost" rules. + return _Res_dist_from_begin < _Tgt_dist_from_begin; } } + + if (_Res_grp._End != _Tgt_grp._End) { + // If one of the matches is longer than the other, and they start at the same point, + // it is considered better. + ptrdiff_t _Res_size = _STD distance(_Res_grp._Begin, _Res_grp._End); + ptrdiff_t _Tgt_size = _STD distance(_Tgt_grp._Begin, _Tgt_grp._End); + return _Tgt_size > _Res_size; + } } + + // Use the current match if target isn't better. return false; } diff --git a/tests/std/include/test_regex_support.hpp b/tests/std/include/test_regex_support.hpp index fc11c74929..0dadc54113 100644 --- a/tests/std/include/test_regex_support.hpp +++ b/tests/std/include/test_regex_support.hpp @@ -116,6 +116,59 @@ class regex_fixture { } } + void should_capture_groups(const std::string& subject, const std::string& pattern, + const std::vector& groups, + const std::regex_constants::syntax_option_type flags = std::regex_constants::ECMAScript) { + try { + const std::regex r(pattern, flags); + std::smatch m; + + if (!std::regex_match(subject, m, r)) { + printf(R"(Expected regex("%s") to match "%s".)" + "\n", + pattern.c_str(), subject.c_str()); + fail_regex(); + return; + } + + if (m[0] != subject) { + printf(R"(should_capture("%s", "%s"): not a full match)" + "\nexpected: \"%s\"" + "\nfound: \"%s\"\n", + subject.c_str(), pattern.c_str(), subject.c_str(), m[0].str().c_str()); + fail_regex(); + } + + if (m.size() != groups.size() + 1) { + printf(R"(should_capture("%s", "%s"): captures: %zd != %zd)" + "\n", + subject.c_str(), pattern.c_str(), m.size() - 1, groups.size()); + fail_regex(); + } + + for (size_t i = 0; i < groups.size(); ++i) { + if (m[i + 1] != groups[i]) { + printf(R"(should_capture("%s", "%s"): index %zd)" + "\nexpected: \"%s\"" + "\nfound: \"%s\"\n", + subject.c_str(), pattern.c_str(), i, groups[i].c_str(), m[i + 1].str().c_str()); + fail_regex(); + } + } + } catch (const std::regex_error& e) { + std::string groups_string = "|"; + for (const auto& el : groups) { + groups_string.append(el); + groups_string.push_back('|'); + } + + printf(R"(should_capture("%s", "%s", "%s"): regex_error: "%s")" + "\n", + subject.c_str(), pattern.c_str(), groups_string.c_str(), e.what()); + fail_regex(); + } + } + void should_replace_to(const std::string& subject, const std::string& pattern, const std::string& fmt, const std::regex_constants::match_flag_type match_flags, const std::string& expected) { try { diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index f5abd2955e..d51de683a8 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -546,6 +546,17 @@ void test_VSO_226914_word_boundaries() { aWordAny.should_search_fail("aa", match_not_bow | match_not_eow); } +void test_gh_731() { + std::vector groups{"AAA", "BBB", ""}; + g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*(B*)?)", groups); + + groups.assign({"AAA", "BBB"}); + g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*)", groups); + + groups.assign({"c"}); + g_regexTester.should_capture_groups("aabaac", "(aa|aabaac|ba|b|c)*", groups, std::regex_constants::extended); +} + void test_gh_993() { // GH-993 regex::icase is not handled correctly for some input. { @@ -607,6 +618,7 @@ int main() { test_VSO_225160_match_bol_flag(); test_VSO_225160_match_eol_flag(); test_VSO_226914_word_boundaries(); + test_gh_731(); test_gh_993(); return g_regexTester.result();