Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent attempts at nesting character sets in regex filter #389

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 52 additions & 20 deletions futaba/cogs/filter/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,32 +103,64 @@ def build_regex(text, groups):

return str(pattern)

@staticmethod
def generate_confusable_literals(confusable: chr) -> list:
# Generates list of confusable homoglyphs LITERALs based on input
# character, includes input character

groups = confusables.is_confusable(confusable, greedy=True)

confusable_literals = [(sre_parse.LITERAL, ord(confusable))]

if not groups:
return confusable_literals

# Append confusables to list as LITERALs
for homoglyph in groups[0]["homoglyphs"]:
confusable_literals += [
(re.sre_parse.LITERAL, ord(char)) for char in homoglyph["c"]
]

return confusable_literals

@staticmethod
def convert_raw_regex_ast(regex_ast: Iterable):
for index, value in enumerate(regex_ast):
# Parse lexemes for LITERALs
if isinstance(value, tuple):
lexeme_tuple = value
if lexeme_tuple[0] == sre_parse.LITERAL:
# LITERAL found, check if it's a confusable homoglyph...
groups = confusables.is_confusable(
chr(lexeme_tuple[1]), greedy=True
# (TOKEN_NAME, TOKEN_VALUE) likely
(
token_name,
token_value,
*_,
) = value
if token_name == sre_parse.IN:
# (IN, [ (LITERAL, ord) ... ])
# sets cannot be nested, need to insert confusables in place
confusable_literals = []
for set_token in token_value:
if set_token[0] == sre_parse.RANGE:
# (RANGE, (LOWER_BOUND, UPPER_BOUND)), e.g. a-z in [a-z]
lower_bound = set_token[1][0]
upper_bound = set_token[1][1]
for ord_value in range(lower_bound, upper_bound + 1):
confusable_literals += (
Filter.generate_confusable_literals(chr(ord_value))
)
else:
# Must be LITERAL
confusable_literals += Filter.generate_confusable_literals(
chr(set_token[1])
)
token_value += confusable_literals
if token_name == sre_parse.LITERAL:
# (LITERAL, ord), replace with set
confusable_literals = Filter.generate_confusable_literals(
chr(token_value)
)
if not groups:
continue
# Convert group into list of confusable LITERALs
group = groups[0] # one char, so only one group
confusable_literals = [lexeme_tuple]
for homoglyph in group["homoglyphs"]:
confusable_literals += [
(sre_parse.LITERAL, ord(char)) for char in homoglyph["c"]
]
in_lexeme_tuple = (sre_parse.IN, confusable_literals)

# Overwrite this lexeme
regex_ast[index] = in_lexeme_tuple
regex_ast[index] = (sre_parse.IN, confusable_literals)
else:
# More possible lexemes, recurse and overwrite...
# Miscellaneous token, might contain more tokens
# Recurse and overwrite...
regex_ast[index] = tuple(Filter.convert_raw_regex_ast(list(value)))
elif isinstance(value, sre_parse.SubPattern):
regex_ast[index] = Filter.convert_raw_regex_ast(value)
Expand Down