Python re module: patterns, flags, match/search/findall, groups, and substitution
Match specific sets of characters
. # any char except newline
\d # digit ([0-9] with re.ASCII; any Unicode decimal digit by default for str patterns)
\D # non-digit
\w # word char ([a-zA-Z0-9_] with re.ASCII; Unicode alphanumerics and _ by default for str patterns)
\W # non-word char
\s # whitespace
[abc] # a, b, or c
[^abc] # not a, b, or c
[a-z] # lowercase letter
Control how many times a pattern repeats
* # 0 or more (greedy)
+ # 1 or more (greedy)
? # 0 or 1 (optional)
{3} # exactly 3 times
{2,5} # 2 to 5 times
*? # 0 or more (non-greedy)
+? # 1 or more (non-greedy)
Match positions in a string, not characters
^ # start of string (or line with re.M)
$ # end of string, or just before a trailing newline (end of each line with re.M)
\b # word boundary
\B # non-word boundary
\A # absolute start of string
\Z # absolute end of string
Capture and reference parts of a match
(abc) # capturing group
(?:abc) # non-capturing group
(?P<name>abc) # named group
(a|b|c) # alternation (a or b or c)
(?=abc) # positive lookahead
(?!abc) # negative lookahead
Make pattern matching case-insensitive
re.search(r"python", text, re.IGNORECASE)
# matches "Python", "PYTHON", "python"
Make ^ and $ match start/end of each line
re.findall(r"^\w+", text, re.MULTILINE)
# matches first word on each line
Make . match newline characters too
re.search(r"start.*end", text, re.DOTALL)
# matches across multiple lines
Allow whitespace and comments in patterns
pattern = re.compile(r"""
\d{4} # year
- # separator
\d{2} # month
""", re.VERBOSE)
match checks only at start; search checks anywhere
import re
re.match(r"\d+", "abc123") # None (not at start)
re.search(r"\d+", "abc123") # match object for "123"
Find all occurrences as a list or iterator
re.findall(r"\d+", "a1 b22 c333") # ["1", "22", "333"]
for m in re.finditer(r"\d+", text):
    print(m.group(), m.start(), m.end())
Replace matches with a string or function
re.sub(r"\s+", " ", text) # collapse whitespace
re.sub(r"(\w+)", r"[\1]", text) # wrap words in brackets
re.sub(r"\d+", lambda m: str(int(m.group())*2), text)
Split string at each match
re.split(r"[,;\s]+", "a, b; c d") # ["a", "b", "c", "d"]
re.split(r"(,)", "a,b,c") # ["a", ",", "b", ",", "c"]
Compile a pattern for repeated use
pattern = re.compile(r"\b\w{5}\b", re.IGNORECASE)
pattern.findall("Hello world there")
# ["Hello", "world", "there"]
Access match groups by name for clarity
m = re.search(r"(?P<year>\d{4})-(?P<month>\d{2})", "2024-01")
m.group("year") # "2024"
m.group("month") # "01"
m.groupdict() # {"year": "2024", "month": "01"}
Refer back to a captured group in the same pattern
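re.search(r"(\w+) \1", "hello hello") # matches repeated word
re.sub(r"(\w+) \1", r"\1", text) # remove duplicates
# Note: without \b anchors this also matches inside words ("this is" -> "this"); use r"\b(\w+) \1\b" to compare whole words only
Match a common email address format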
import re
# Common "local@domain.tld" shape. The end is anchored with \Z rather than $,
# because $ also matches just before a trailing newline and would let
# "user@example.com\n" through.
EMAIL_RE = re.compile(
    r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\Z"
)


def is_valid_email(email: str) -> bool:
    """Return True if *email* has a common ``local@domain.tld`` shape.

    This is a format check only: it does not verify that the domain exists
    or that the address can receive mail.

    Args:
        email: The candidate address, with no surrounding whitespace.

    Returns:
        True when the whole string matches ``EMAIL_RE``, otherwise False.
    """
    return bool(EMAIL_RE.match(email))
is_valid_email("user@example.com") # True
is_valid_email("not-an-email") # False
Find all URLs in a block of text
import re
# Compiled once; the pattern is matched case-insensitively and a URL runs
# until whitespace or one of the excluded delimiter characters.
URL_RE = re.compile(r'https?://[^\s<>"{}|\\^\[\]]+', flags=re.IGNORECASE)

text = "Visit https://example.com or http://docs.python.org"
# Walk the matches and keep only the matched text of each one.
urls = [hit.group(0) for hit in URL_RE.finditer(text)]
# ["https://example.com", "http://docs.python.org"]
Extract structured data from log lines
import re
from datetime import datetime
# Fields of one log line, in order; each piece except the last keeps its
# trailing separator space, so the joined pattern is a single-space layout.
LOG_RE = re.compile(
    "".join(
        (
            r"(?P<date>\d{4}-\d{2}-\d{2}) ",
            r"(?P<time>\d{2}:\d{2}:\d{2}) ",
            r"(?P<level>\w+) ",
            r"(?P<message>.+)",
        )
    )
)

line = "2024-01-15 14:30:00 ERROR Database connection failed"
# Matching starts at the beginning of the line; m is None for any other shape.
m = re.match(LOG_RE, line)
if m:
    print(m.group("level")) # ERROR
    print(m.group("message")) # Database connection failed
Always use raw strings (r"\d+") for regex patterns to avoid double-escaping backslashes
Use re.compile() when applying the same pattern multiple times for better performance
Use named groups ((?P<name>...)) in complex patterns to make matches self-documenting
Test patterns interactively at regex101.com with Python flavor selected
Prefer non-greedy quantifiers (*?, +?) when matching HTML or nested structures