Regular expressions are powerful but tricky. Here's how to use them effectively in Python.
Basic Matching
import re
# Check whether the pattern occurs anywhere in the string
if re.search(r"hello", "hello world"):
    print("Found")
# Match at the start of the string only
if re.match(r"hello", "hello world"):
    print("Starts with hello")
# Match only if the whole string fits the pattern
if re.fullmatch(r"hello", "hello"):
    print("Exact match")
# Find all occurrences
matches = re.findall(r"\d+", "abc 123 def 456")
print(matches) # ['123', '456']
Raw Strings
# Always use raw strings (r"...") for patterns
pattern = r"\d+" # Good: literal backslash-d
pattern = "\\d+" # Works but harder to read
# Especially important for:
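r"\n" # Two characters (backslash, n); the regex engine reads them as the newline escape
"\n" # One actual newline character; as a pattern it also matches a newline
Match Objects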
import re
text = "Email: alice@example.com"
match = re.search(r"(\w+)@(\w+)\.(\w+)", text)
if match:
    print(match.group())    # alice@example.com (the whole match)
    print(match.group(1))   # alice
    print(match.group(2))   # example
    print(match.groups())   # ('alice', 'example', 'com')
    print(match.start())    # 7
    print(match.end())      # 24
    print(match.span())     # (7, 24)


# Named Groups
import re
pattern = r"(?P<user>\w+)@(?P<domain>\w+)\.(?P<tld>\w+)"
match = re.search(pattern, "alice@example.com")
if match:
    print(match.group("user"))    # alice
    print(match.group("domain"))  # example
    print(match.groupdict())      # {'user': 'alice', 'domain': 'example', 'tld': 'com'}


# Flags
import re
# Case insensitive
re.search(r"hello", "HELLO", re.IGNORECASE)
re.search(r"(?i)hello", "HELLO") # Inline flag
# Multiline (^ and $ match line boundaries)
re.findall(r"^\w+", "line1\nline2", re.MULTILINE)
# Dotall (. matches newlines too)
re.search(r"a.b", "a\nb", re.DOTALL)
# Verbose (allow comments and whitespace)
pattern = re.compile(r"""
\d{3} # Area code
[-.]? # Optional separator
\d{3} # First 3 digits
[-.]? # Optional separator
\d{4} # Last 4 digits
""", re.VERBOSE)
# Combine flags
re.search(r"hello", "HELLO\nWORLD", re.IGNORECASE | re.MULTILINE)
Compiled Patterns
import re
# Compile once, use many times
EMAIL_RE = re.compile(r"[\w.-]+@[\w.-]+\.\w+")


def has_email(text):
    """Return True if ``text`` contains at least one email-like substring."""
    return EMAIL_RE.search(text) is not None


def find_emails(text):
    """Return every email-like substring of ``text`` as a list of strings."""
    return EMAIL_RE.findall(text)


# Substitution
import re
# Simple replace
result = re.sub(r"\d+", "X", "abc 123 def 456")
print(result) # abc X def X
# With backreferences
result = re.sub(r"(\w+)@(\w+)", r"\2@\1", "alice@example")
print(result) # example@alice
# With function
def double(match):
    """Return the matched integer doubled, as a string.

    Intended as the ``repl`` callable for ``re.sub``: it receives the match
    object for each hit and returns the replacement text.
    """
    return str(int(match.group()) * 2)
result = re.sub(r"\d+", double, "abc 5 def 10")
print(result) # abc 10 def 20
# Limit replacements
result = re.sub(r"\d+", "X", "1 2 3 4 5", count=2)
print(result) # X X 3 4 5
Splitting
import re
# Split on pattern
parts = re.split(r"\s+", "hello world foo")
print(parts) # ['hello', 'world', 'foo']
# Keep delimiters
parts = re.split(r"(\s+)", "hello world")
print(parts) # ['hello', ' ', 'world']
# Limit splits
parts = re.split(r"\s+", "a b c d e", maxsplit=2)
print(parts) # ['a', 'b', 'c d e']
Common Patterns
import re
# Email (simplified)
EMAIL = r"[\w.-]+@[\w.-]+\.\w+"
# URL
URL = r"https?://[\w.-]+(?:/[\w./-]*)?"
# Phone (US)
PHONE = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
# Date (YYYY-MM-DD)
DATE = r"\d{4}-\d{2}-\d{2}"
# Time (HH:MM:SS)
TIME = r"\d{2}:\d{2}(?::\d{2})?"
# IPv4
IPV4 = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
# Slug (URL-safe string)
SLUG = r"[a-z0-9]+(?:-[a-z0-9]+)*"
# Username
USERNAME = r"[a-zA-Z][a-zA-Z0-9_]{2,29}"
Non-Greedy Matching
import re
text = "<tag>content</tag>"
# Greedy (default): matches as much as possible
print(re.search(r"<.*>", text).group()) # <tag>content</tag>
# Non-greedy: matches as little as possible
print(re.search(r"<.*?>", text).group()) # <tag>
Lookahead and Lookbehind
import re
# Positive lookahead: match only if followed by
re.findall(r"\d+(?= dollars)", "100 dollars 50 euros") # ['100']
# Negative lookahead: match only if NOT followed by
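re.findall(r"\d+(?! dollars)", "100 dollars 50 euros") # ['10', '50'] -- "10" matches too: it is followed by "0", not " dollars"
# Use r"\b\d+\b(?! dollars)" to get only ['50'].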
# Positive lookbehind: match only if preceded by
re.findall(r"(?<=\$)\d+", "$100 £50") # ['100']
# Negative lookbehind: match only if NOT preceded by
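re.findall(r"(?<!\$)\d+", "$100 £50") # ['00', '50'] -- "00" matches too: it is preceded by "1", not "$"
# Use r"(?<![$\d])\d+" to get only ['50'].
Practical Examples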
import re
# Extract domain from URL
def get_domain(url):
    """Return the host part of an http(s) URL, or None if none is found.

    The host is everything after ``http://`` or ``https://`` up to the first
    ``/``, ``?`` or ``#``, so a port or user-info prefix, when present, is
    included in the result.
    """
    # Stop at "?" and "#" as well as "/" so that a URL without a path
    # (e.g. "http://example.com?x=1") does not leak its query or fragment.
    match = re.search(r"https?://([^/?#]+)", url)
    return match.group(1) if match else None
# Validate password strength
def is_strong_password(password):
    """Return True if ``password`` passes every strength check.

    The checks are: a run of at least 8 characters, an uppercase ASCII
    letter, a lowercase ASCII letter, a digit, and one of ``!@#$%^&*``.
    """
    checks = [
        r".{8,}",        # At least 8 chars
        r"[A-Z]",        # Has uppercase
        r"[a-z]",        # Has lowercase
        r"\d",           # Has digit
        r"[!@#$%^&*]",   # Has special char
    ]
    return all(re.search(p, password) for p in checks)
# Clean whitespace
def normalize_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    return re.sub(r"\s+", " ", text).strip()
# Extract hashtags
def find_hashtags(text):
    """Return the hashtags in ``text`` as a list, without the leading ``#``."""
    return re.findall(r"#(\w+)", text)
# Mask sensitive data
def mask_email(text):
    """Mask the local part of every email address in ``text``.

    Only the first character of the local part is kept, e.g.
    ``alice@example.com`` becomes ``a***@example.com``; text around the
    address is left unchanged.
    """
    return re.sub(
        # [^@\s]* (rather than [^@]*) keeps the match inside one
        # whitespace-delimited token; otherwise it would start at the first
        # word character of the text and swallow everything up to the "@".
        r"(\w)[^@\s]*(@\w+\.\w+)",
        r"\1***\2",
        text,
    )


# Escaping
import re
# Escape special characters
user_input = "hello (world)"
pattern = re.escape(user_input)
print(pattern) # hello\ \(world\)
# Safe search with user input
def safe_search(text, query):
    """Search ``text`` for ``query`` taken literally.

    ``query`` is passed through ``re.escape`` so that any regex
    metacharacters it contains match themselves. Returns the match object
    for the first occurrence, or None if there is none.
    """
    return re.search(re.escape(query), text)


# Performance Tips
import re
# Compile frequently used patterns
PATTERN = re.compile(r"\d+")
# Use non-capturing groups when you don't need captures
r"(?:https?://)?" # Non-capturing
r"(https?://)?" # Capturing (slower, stores match)
# Anchor patterns when possible
r"^start" # Faster than r"start" at beginning
r"end$" # Faster than r"end" at end
# Be specific
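r"[0-9]" # ASCII digits only; for str patterns \d also matches other Unicode digits unless re.ASCII is set
r"[a-zA-Z0-9_]" # ASCII-only counterpart of \w; for str patterns \w also matches non-ASCII letters and digits unless re.ASCII is set
Common Mistakes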
import re
# Mistake: forgetting raw string
re.search("\d+", "123") # Works, but "\d" is an invalid string escape: DeprecationWarning since Python 3.6, SyntaxWarning since 3.12
re.search(r"\d+", "123") # Better
# Mistake: using match when you want search
re.match(r"world", "hello world") # None (match is at start only)
re.search(r"world", "hello world") # Match found
# Mistake: greedy when you want non-greedy
re.sub(r"<.*>", "", "<a>text</a>") # Removes everything
re.sub(r"<.*?>", "", "<a>text</a>") # Removes tags only
# Mistake: not escaping user input
user = "hello.*"
re.search(user, text) # Dangerous
re.search(re.escape(user), text) # Safe
Regular expressions are a precision tool. Use them for pattern matching, but don't force them where simpler string methods work.
React to this post: