From f45ed7086224b6eff540e9350dcdcc7ea1dd386c Mon Sep 17 00:00:00 2001 From: Ievgen Polyvanyi Date: Tue, 22 Dec 2020 01:54:44 +0000 Subject: [PATCH 1/6] Fix better_match check used by regex search routine --- stl/inc/regex | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/stl/inc/regex b/stl/inc/regex index 8747ea9ed7..f257f7ddd7 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3399,6 +3399,27 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap template bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under UNIX rules + unsigned int _Res_valid_count = 0; + unsigned int _Tgt_state_valid_count = 0; + + for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { + if (_Res._Grp_valid[_Ix]) { + ++_Res_valid_count; + } + + if (_Tgt_state._Grp_valid[_Ix]) { + ++_Tgt_state_valid_count; + } + } + + // more captured groups means a better match + if (_Tgt_state_valid_count > _Res_valid_count) { + return true; + } + else if (_Tgt_state_valid_count < _Res_valid_count) { + return false; + } + for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group if (_Res._Grp_valid[_Ix] && _Tgt_state._Grp_valid[_Ix]) { if (_Res._Grps[_Ix]._Begin != _Tgt_state._Grps[_Ix]._Begin) { From e7bec219cf5cf35e71306a6b0d3ca605438a0223 Mon Sep 17 00:00:00 2001 From: Ievgen Polyvanyi Date: Tue, 22 Dec 2020 23:29:21 +0000 Subject: [PATCH 2/6] Add unit tests and fix code style --- stl/inc/regex | 9 ++-- tests/std/include/test_regex_support.hpp | 43 +++++++++++++++++++ .../std/tests/VSO_0000000_regex_use/test.cpp | 6 +++ 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index f257f7ddd7..8960da3a59 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3399,7 +3399,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap template bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under UNIX rules - unsigned int _Res_valid_count = 0; + unsigned int _Res_valid_count = 0; unsigned int _Tgt_state_valid_count = 0; for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { @@ -3413,11 +3413,8 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for bet } // more captured groups means a better match - if (_Tgt_state_valid_count > _Res_valid_count) { - return true; - } - else if (_Tgt_state_valid_count < _Res_valid_count) { - return false; + if (_Tgt_state_valid_count != _Res_valid_count) { + return _Tgt_state_valid_count > _Res_valid_count; } for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group diff --git a/tests/std/include/test_regex_support.hpp b/tests/std/include/test_regex_support.hpp index fc11c74929..f4c3622cf5 100644 --- a/tests/std/include/test_regex_support.hpp +++ b/tests/std/include/test_regex_support.hpp @@ -3,6 +3,7 @@ #pragma once #include +#include #include #include @@ -116,6 +117,48 @@ class regex_fixture { } } + void should_capture_groups( + const std::string& subject, const std::string& pattern, const std::vector& groups) { + try { + const std::regex r(pattern); + std::smatch m; + + if (!std::regex_match(subject, m, r)) { + printf(R"(Expected regex("%s") to match "%s".)" + "\n", + pattern.c_str(), subject.c_str()); + fail_regex(); + return; + } + + if (m[0] != subject) { + printf(R"(should_capture("%s", "%s"): m[0] == "%s")" + "\n", + subject.c_str(), pattern.c_str(), m[0].str().c_str()); + fail_regex(); + } + + for (size_t i = 0; i < groups.size(); ++i) { + if (m[i + 1] != groups[i]) { + printf(R"(should_capture("%s", "%s"): m[%zd] == "%s")" + "\n", + groups[i].c_str(), pattern.c_str(), i, m[i + 1].str().c_str()); + fail_regex(); + } + } + } catch (const std::regex_error& e) { + std::string groups_string = std::accumulate(groups.cbegin(), groups.cend(), std::string(), + [](const std::string& result, const std::string& value) -> std::string { + return !result.empty() ? result + ";" + value : value; + }); + + printf(R"(should_capture("%s", "%s", "%s"): regex_error: "%s")" + "\n", + subject.c_str(), pattern.c_str(), groups_string.c_str(), e.what()); + fail_regex(); + } + } + void should_replace_to(const std::string& subject, const std::string& pattern, const std::string& fmt, const std::regex_constants::match_flag_type match_flags, const std::string& expected) { try { diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index f5abd2955e..9dfa182760 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -581,6 +581,11 @@ void test_gh_993() { } } +void test_gh_731() { + std::vector groups{"AAA", "BBB"}; + g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*)", groups); +} + int main() { test_dev10_449367_case_insensitivity_should_work(); test_dev11_462743_regex_collate_should_not_disable_regex_icase(); @@ -607,6 +612,7 @@ int main() { test_VSO_225160_match_bol_flag(); test_VSO_225160_match_eol_flag(); test_VSO_226914_word_boundaries(); + test_gh_731(); test_gh_993(); return g_regexTester.result(); From af26bd491da82d6be455ae571d2132370efe0247 Mon Sep 17 00:00:00 2001 From: Ievgen Polyvanyi Date: Wed, 23 Dec 2020 01:11:49 +0000 Subject: [PATCH 3/6] Refer egrep mode with multiline captures --- stl/inc/regex | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 8960da3a59..1d46660a04 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3402,21 +3402,6 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for bet unsigned int _Res_valid_count = 0; unsigned int _Tgt_state_valid_count = 0; - for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { - if (_Res._Grp_valid[_Ix]) { - ++_Res_valid_count; - } - - if (_Tgt_state._Grp_valid[_Ix]) { - ++_Tgt_state_valid_count; - } - } - - // more captured groups means a better match - if (_Tgt_state_valid_count != _Res_valid_count) { - return _Tgt_state_valid_count > _Res_valid_count; - } - for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group if (_Res._Grp_valid[_Ix] && _Tgt_state._Grp_valid[_Ix]) { if (_Res._Grps[_Ix]._Begin != _Tgt_state._Grps[_Ix]._Begin) { @@ -3428,8 +3413,18 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for bet return _STD distance(_Begin, _Res._Grps[_Ix]._End) < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._End); } } + + if (_Res._Grp_valid[_Ix]) { + ++_Res_valid_count; + } + + if (_Tgt_state._Grp_valid[_Ix]) { + ++_Tgt_state_valid_count; + } } - return false; + + // more captured groups means a better match + return _Tgt_state_valid_count > _Res_valid_count; } template From 7cf7f371f19d550b6e23503d27f7ee25aa07a878 Mon Sep 17 00:00:00 2001 From: Nicole Mazzuca Date: Tue, 23 Aug 2022 11:47:17 -0700 Subject: [PATCH 4/6] use leftmost longest rule --- stl/inc/regex | 47 ++++++++++++------- tests/std/include/test_regex_support.hpp | 14 ++++-- .../std/tests/VSO_0000000_regex_use/test.cpp | 13 +++-- 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 1d46660a04..326afb53f0 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -3398,33 +3399,43 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap } template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under UNIX rules - unsigned int _Res_valid_count = 0; - unsigned int _Tgt_state_valid_count = 0; - +bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under leftmost-longest rules for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group - if (_Res._Grp_valid[_Ix] && _Tgt_state._Grp_valid[_Ix]) { - if (_Res._Grps[_Ix]._Begin != _Tgt_state._Grps[_Ix]._Begin) { - return _STD distance(_Begin, _Res._Grps[_Ix]._Begin) - < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin); - } - - if (_Res._Grps[_Ix]._End != _Tgt_state._Grps[_Ix]._End) { - return _STD distance(_Begin, _Res._Grps[_Ix]._End) < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._End); + const auto& _Res_grp = _Res._Grps[_Ix]; + const auto& _Tgt_grp = _Tgt_state._Grps[_Ix]; + if (_Res_grp._Begin == _Res_grp._End) { + if (_Tgt_grp._Begin == _Tgt_grp._End) { + // Both groups are empty or invalid - matches are equivalent. + continue; } + // Target group is valid and non-empty, and current group is either empty or invalid. + return true; + } + if (_Tgt_grp._Begin == _Tgt_grp._End) { + // Target group is invalid or empty, and current group is valid and non-empty, + // so our current group is the better match. + return false; } - if (_Res._Grp_valid[_Ix]) { - ++_Res_valid_count; + if (_Res_grp._Begin != _Tgt_grp._Begin) { + // If one of the matches starts before the other, it is considered better. + // This is also known as "leftmost", in languages written from left to right. + ptrdiff_t _Res_dist_from_begin = _STD distance(_Begin, _Res_grp._Begin); + ptrdiff_t _Tgt_dist_from_begin = _STD distance(_Begin, _Tgt_grp._Begin); + return _Tgt_dist_from_begin < _Res_dist_from_begin; } - if (_Tgt_state._Grp_valid[_Ix]) { - ++_Tgt_state_valid_count; + if (_Res_grp._End != _Tgt_grp._End) { + // If one of the matches is longer than the other, and they start at the same point, + // it is considered better. + ptrdiff_t _Res_size = _STD distance(_Res_grp._Begin, _Res_grp._End); + ptrdiff_t _Tgt_size = _STD distance(_Tgt_grp._Begin, _Tgt_grp._End); + return _Tgt_size > _Res_size; } } - // more captured groups means a better match - return _Tgt_state_valid_count > _Res_valid_count; + // Use the current match if target isn't better. + return false; } template diff --git a/tests/std/include/test_regex_support.hpp b/tests/std/include/test_regex_support.hpp index f4c3622cf5..5dadc64745 100644 --- a/tests/std/include/test_regex_support.hpp +++ b/tests/std/include/test_regex_support.hpp @@ -138,11 +138,19 @@ class regex_fixture { fail_regex(); } + if (m.size() != groups.size() + 1) { + printf(R"(should_capture("%s", "%s"): captures: %zd != %zd)" + "\n", + subject.c_str(), pattern.c_str(), m.size() - 1, groups.size()); + fail_regex(); + } + for (size_t i = 0; i < groups.size(); ++i) { if (m[i + 1] != groups[i]) { - printf(R"(should_capture("%s", "%s"): m[%zd] == "%s")" - "\n", - groups[i].c_str(), pattern.c_str(), i, m[i + 1].str().c_str()); + printf(R"(should_capture("%s", "%s"): index %zd)" + "\nexpected: \"%s\"" + "\nfound: \"%s\"\n", + subject.c_str(), pattern.c_str(), i, m[i + 1].str().c_str(), groups[i].c_str()); fail_regex(); } } diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 9dfa182760..11cfad8db9 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -546,6 +546,14 @@ void test_VSO_226914_word_boundaries() { aWordAny.should_search_fail("aa", match_not_bow | match_not_eow); } +void test_gh_731() { + std::vector groups{"AAA", "BBB", ""}; + g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*(B*)?)", groups); + + groups.assign({"AAA", "BBB"}); + g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*)", groups); +} + void test_gh_993() { // GH-993 regex::icase is not handled correctly for some input. { @@ -581,11 +589,6 @@ void test_gh_993() { } } -void test_gh_731() { - std::vector groups{"AAA", "BBB"}; - g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*)", groups); -} - int main() { test_dev10_449367_case_insensitivity_should_work(); test_dev11_462743_regex_collate_should_not_disable_regex_icase(); From de9c79901311301924108bb15d67286b470da34d Mon Sep 17 00:00:00 2001 From: Nicole Mazzuca Date: Tue, 23 Aug 2022 14:07:18 -0700 Subject: [PATCH 5/6] oh, oops ECMAScript uses leftmost longest rules, but UNIX uses rightmost longest rules --- stl/inc/regex | 14 ++++++++++---- tests/std/include/test_regex_support.hpp | 16 +++++++++------- tests/std/tests/VSO_0000000_regex_use/test.cpp | 3 +++ 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 326afb53f0..f7d676431f 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3399,7 +3399,8 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap } template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under leftmost-longest rules +bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { + // check for better match under the rules of the current regex implementation for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group const auto& _Res_grp = _Res._Grps[_Ix]; const auto& _Tgt_grp = _Tgt_state._Grps[_Ix]; @@ -3418,11 +3419,16 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for bet } if (_Res_grp._Begin != _Tgt_grp._Begin) { - // If one of the matches starts before the other, it is considered better. - // This is also known as "leftmost", in languages written from left to right. ptrdiff_t _Res_dist_from_begin = _STD distance(_Begin, _Res_grp._Begin); ptrdiff_t _Tgt_dist_from_begin = _STD distance(_Begin, _Tgt_grp._Begin); - return _Tgt_dist_from_begin < _Res_dist_from_begin; + if (_Sflags & regex_constants::ECMAScript) { + // If one of the matches starts before the other, in ECMAScript, it is considered better. + // This is also known as "leftmost", in languages written from left to right. + return _Tgt_dist_from_begin < _Res_dist_from_begin; + } else { + // Otherwise, Unix uses "rightmost" rules. + return _Res_dist_from_begin < _Tgt_dist_from_begin; + } } if (_Res_grp._End != _Tgt_grp._End) { diff --git a/tests/std/include/test_regex_support.hpp b/tests/std/include/test_regex_support.hpp index 5dadc64745..95eb278e26 100644 --- a/tests/std/include/test_regex_support.hpp +++ b/tests/std/include/test_regex_support.hpp @@ -117,10 +117,11 @@ class regex_fixture { } } - void should_capture_groups( - const std::string& subject, const std::string& pattern, const std::vector& groups) { + void should_capture_groups(const std::string& subject, const std::string& pattern, + const std::vector& groups, + const std::regex_constants::syntax_option_type flags = std::regex_constants::ECMAScript) { try { - const std::regex r(pattern); + const std::regex r(pattern, flags); std::smatch m; if (!std::regex_match(subject, m, r)) { @@ -132,9 +133,10 @@ class regex_fixture { } if (m[0] != subject) { - printf(R"(should_capture("%s", "%s"): m[0] == "%s")" - "\n", - subject.c_str(), pattern.c_str(), m[0].str().c_str()); + printf(R"(should_capture("%s", "%s"): not a full match)" + "\nexpected: \"%s\"" + "\nfound: \"%s\"\n", + subject.c_str(), pattern.c_str(), subject.c_str(), m[0].str().c_str()); fail_regex(); } @@ -150,7 +152,7 @@ class regex_fixture { printf(R"(should_capture("%s", "%s"): index %zd)" "\nexpected: \"%s\"" "\nfound: \"%s\"\n", - subject.c_str(), pattern.c_str(), i, m[i + 1].str().c_str(), groups[i].c_str()); + subject.c_str(), pattern.c_str(), i, groups[i].c_str(), m[i + 1].str().c_str()); fail_regex(); } } diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 11cfad8db9..d51de683a8 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -552,6 +552,9 @@ void test_gh_731() { groups.assign({"AAA", "BBB"}); g_regexTester.should_capture_groups("AAA BBB", R"((A+)\s*(B+)?\s*B*)", groups); + + groups.assign({"c"}); + g_regexTester.should_capture_groups("aabaac", "(aa|aabaac|ba|b|c)*", groups, std::regex_constants::extended); } void test_gh_993() { From 2f37a4756bd9cdb46bae053e06bb28a52f5f1229 Mon Sep 17 00:00:00 2001 From: Nicole Mazzuca Date: Tue, 23 Aug 2022 16:08:37 -0700 Subject: [PATCH 6/6] remove includes --- stl/inc/regex | 1 - tests/std/include/test_regex_support.hpp | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index f7d676431f..4b9faec9da 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/tests/std/include/test_regex_support.hpp b/tests/std/include/test_regex_support.hpp index 95eb278e26..0dadc54113 100644 --- a/tests/std/include/test_regex_support.hpp +++ b/tests/std/include/test_regex_support.hpp @@ -3,7 +3,6 @@ #pragma once #include -#include #include #include @@ -157,10 +156,11 @@ class regex_fixture { } } } catch (const std::regex_error& e) { - std::string groups_string = std::accumulate(groups.cbegin(), groups.cend(), std::string(), - [](const std::string& result, const std::string& value) -> std::string { - return !result.empty() ? result + ";" + value : value; - }); + std::string groups_string = "|"; + for (const auto& el : groups) { + groups_string.append(el); + groups_string.push_back('|'); + } printf(R"(should_capture("%s", "%s", "%s"): regex_error: "%s")" "\n",