Beyond basic matching, Python's re module offers powerful pattern capabilities.
Compiled Patterns
import re
# Compile once, use many times
EMAIL = re.compile(r'[\w.-]+@[\w.-]+\.\w+')
emails = EMAIL.findall(text)
if EMAIL.match(user_input):
    # valid email format
    pass
Flags
import re
# Case insensitive
re.search(r'hello', 'HELLO', re.IGNORECASE)
re.search(r'hello', 'HELLO', re.I)
# Multiline (^ and $ match line boundaries)
re.findall(r'^item', text, re.MULTILINE)
re.findall(r'^item', text, re.M)
# Dotall (. matches newlines)
re.search(r'start.*end', text, re.DOTALL)
re.search(r'start.*end', text, re.S)
# Verbose (allow comments and whitespace)
pattern = re.compile(r'''
\d{3} # Area code
[-.\s]? # Optional separator
\d{3} # Exchange
[-.\s]? # Optional separator
\d{4} # Number
''', re.VERBOSE)
# Combine flags
re.search(r'pattern', text, re.I | re.M | re.S)
Named Groups
import re
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.match(pattern, '2026-03-21')
print(match.group('year')) # 2026
print(match.group('month')) # 03
print(match.groupdict()) # {'year': '2026', 'month': '03', 'day': '21'}
Non-Capturing Groups
import re
# Capturing group
re.findall(r'(https?)://(\S+)', text) # Returns tuples
# Non-capturing group
re.findall(r'(?:https?)://(\S+)', text) # Only captures URL
Lookahead and Lookbehind
import re
# Positive lookahead: match if followed by
re.findall(r'\w+(?=@)', 'user@example.com') # ['user']
# Negative lookahead: match if NOT followed by
re.findall(r'\d+(?!px)', '100px 200em 300') # ['10', '200', '300'] -- '10' matches after backtracking; use r'\d+(?!\d|px)' to get ['200', '300']
# Positive lookbehind: match if preceded by
re.findall(r'(?<=\$)\d+', 'Price: $100') # ['100']
# Negative lookbehind: match if NOT preceded by
re.findall(r'(?<!\$)\d+', '$100 200') # ['00', '200']
Substitution
import re
# Basic replace
re.sub(r'\s+', ' ', 'too many spaces')
# With function
def double(match):
    """Return the matched integer text doubled, as a string.

    Written as a replacement callback for ``re.sub``: ``match`` is the
    match object for a run of digits, and the returned string is what
    gets substituted in its place.
    """
    # match.group() is the whole matched text; re.sub needs a str back.
    return str(int(match.group()) * 2)
re.sub(r'\d+', double, 'a1b2c3') # 'a2b4c6'
# With groups
re.sub(r'(\w+)@(\w+)', r'\2:\1', 'user@host') # 'host:user'
# Limit replacements
re.sub(r'\d', 'X', '123456', count=3) # 'XXX456'
Split
import re
# Split on pattern
re.split(r'[,;\s]+', 'a, b; c d') # ['a', 'b', 'c', 'd']
# Keep delimiters
re.split(r'([,;])', 'a,b;c') # ['a', ',', 'b', ';', 'c']
# Limit splits
re.split(r'\s', 'a b c d', maxsplit=2) # ['a', 'b', 'c d']
finditer (Memory Efficient)
import re
# For large texts, use iterator
for match in re.finditer(r'\b\w+@\w+\.\w+\b', huge_text):
    print(match.group(), match.start(), match.end())
Match Object Methods
import re
text = "Hello, World!"
match = re.search(r'(\w+), (\w+)', text)
match.group() # 'Hello, World'
match.group(0) # Same as above
match.group(1) # 'Hello'
match.group(2) # 'World'
match.groups() # ('Hello', 'World')
match.start() # 0
match.end() # 12
match.span() # (0, 12)
Greedy vs Non-Greedy
import re
text = '<tag>content</tag>'
# Greedy (default)
re.findall(r'<.*>', text) # ['<tag>content</tag>']
# Non-greedy
re.findall(r'<.*?>', text) # ['<tag>', '</tag>']
# Also works with +, ?
re.findall(r'\d+?', '12345') # ['1', '2', '3', '4', '5']
Atomic Groups (Python 3.11+)
import re
# Possessive quantifiers prevent backtracking
re.search(r'a++b', 'aaab') # Faster, no backtracking
Practical Patterns
Email Validation
EMAIL = re.compile(r'''
^ # Start
[\w.+-]+ # Local part
@ # At symbol
[a-zA-Z\d.-]+ # Domain
\. # Dot
[a-zA-Z]{2,} # TLD
$ # End
''', re.VERBOSE)
URL Extraction
URL = re.compile(r'''
https?:// # Protocol
(?:[\w-]+\.)+ # Subdomains
[\w-]+ # Domain
(?:/[\w./-]*)? # Path
(?:\?[\w=&]*)? # Query
''', re.VERBOSE)
Password Validation
def is_strong_password(password):
    """Return True if ``password`` satisfies every strength rule below.

    Each rule is a regular expression that must match somewhere in the
    password (checked with ``re.search``); the password is strong only
    when all of them match.
    """
    patterns = [
        r'.{8,}',       # At least 8 chars ('.' does not match a newline)
        r'[A-Z]',       # Uppercase
        r'[a-z]',       # Lowercase
        r'\d',          # Digit
        r'[!@#$%^&*]',  # Special char
    ]
    return all(re.search(p, password) for p in patterns)


# Log Parsing
LOG_PATTERN = re.compile(r'''
(?P<ip>\d+\.\d+\.\d+\.\d+)\s+
\[(?P<timestamp>[^\]]+)\]\s+
"(?P<method>\w+)\s+(?P<path>\S+)\s+HTTP/\d\.\d"\s+
(?P<status>\d+)\s+
(?P<size>\d+)
''', re.VERBOSE)
for match in LOG_PATTERN.finditer(log_file):
    print(match.groupdict())
HTML Tag Extraction
# Simple tag matching (use proper parser for complex HTML)
TAGS = re.compile(r'<(\w+)[^>]*>(.*?)</\1>', re.DOTALL)
for tag, content in TAGS.findall(html):
    print(f"{tag}: {content[:50]}")
Performance Tips
import re
# Compile patterns used multiple times
pattern = re.compile(r'\d+')
# Use raw strings to avoid escape issues
r'\n\t\d+' # Correct
'\\n\\t\\d+' # Works but messy
# Anchor patterns when possible
re.match(r'start', text) # Faster than search with ^
# Avoid catastrophic backtracking
# Bad: r'(a+)+b'
# Good: r'a+b'
Escape Special Characters
import re
# Escape user input
user_input = "cost: $100"
safe = re.escape(user_input)
# 'cost:\\ \\$100'
re.search(safe, text) # Safe literal match
Summary
Advanced re patterns:
- Flags: re.I, re.M, re.S, re.X
- Groups: Named (?P<name>), non-capturing (?:)
- Lookaround: (?=), (?!), (?<=), (?<!)
- Greedy/lazy: * vs *?, + vs +?
- Substitution: re.sub() with functions
- Performance: Compile patterns, anchor when possible
Master regex for text processing and validation.
React to this post: