Source code for sentency.regex
def regexize_keywords(
    keyword_str, keyword_delimiter=" ", line_delimiter="\n", case_insensitive=True
):
    r"""Convert a string of keywords into a regular expression.

    The result can be used as input for the sentenCy sentex component. You can
    separate keywords individually or into groups using keyword and line
    delimiters: every line becomes one alternative of the pattern, and within
    a line each keyword delimiter is replaced by ``.*`` so the keywords must
    appear in the given order with anything in between.

    Example usage:

    >>> from sentency.regex import regexize_keywords
    >>> keyword_str = "abdominal aortic aneurysm\naneurysm abdominal aorta"
    >>> regexize_keywords(keyword_str)
    '(?i)((abdominal.*aortic.*aneurysm)|(aneurysm.*abdominal.*aorta))'

    Lines are stripped of surrounding whitespace, and blank lines are ignored
    so they cannot turn into an empty alternative that matches everything. If
    no keywords remain, the returned pattern never matches.

    Keywords are inserted verbatim: regex metacharacters inside a keyword are
    NOT escaped and keep their regex meaning.

    keyword_str: `str`, The keyword string to be converted into a regular
        expression.
    keyword_delimiter: `str`, The delimiter separating individual keywords in
        `keyword_str`. Default is `' '`
    line_delimiter: `str`, The string separating lines into a regular
        expression. Default is `'\n'`
    case_insensitive: `bool`, Should the regular expression be
        case-insensitive?
    RETURNS: `str`, The regular expression
    """
    # Strip every phrase (not only the whole string) so that stray whitespace
    # such as the "\r" of Windows line endings does not leak into the pattern.
    phrases = [phrase.strip() for phrase in keyword_str.split(line_delimiter)]

    # Skip blank phrases: an empty group "()" matches any text and would make
    # the whole alternation match everything.
    keyword_regexes = [
        f'({phrase.replace(keyword_delimiter, ".*")})' for phrase in phrases if phrase
    ]

    if not keyword_regexes:
        # No keywords at all: "(?!)" is an empty negative lookahead, which can
        # never succeed, so the resulting pattern matches nothing.
        keyword_regexes = ["(?!)"]

    ci_flag = "(?i)" if case_insensitive else ""
    return f"{ci_flag}({'|'.join(keyword_regexes)})"