Regular expressions are powerful for text processing. Here's how to use Python's re module effectively.
Basic Functions
import re
text = "Hello, World! Hello, Python!"
# Search for pattern (first match)
match = re.search(r"Hello", text)
if match:
    print(match.group()) # "Hello"
# Match at start only
match = re.match(r"Hello", text)
# Find all matches
matches = re.findall(r"Hello", text) # ["Hello", "Hello"]
# Find with match objects
for match in re.finditer(r"Hello", text):
    print(match.start(), match.end())
# Replace
new_text = re.sub(r"Hello", "Hi", text) # "Hi, World! Hi, Python!"

Pattern Syntax
Basic Patterns
| Pattern | Meaning |
|---|---|
| . | Any character (except newline) |
| ^ | Start of string |
| $ | End of string |
| * | 0 or more |
| + | 1 or more |
| ? | 0 or 1 |
| {n} | Exactly n |
| {n,m} | Between n and m |
Character Classes
| Pattern | Meaning |
|---|---|
| \d | Digit [0-9] |
| \D | Non-digit |
| \w | Word char [a-zA-Z0-9_] |
| \W | Non-word char |
| \s | Whitespace |
| \S | Non-whitespace |
| [abc] | a, b, or c |
| [^abc] | Not a, b, or c |
| [a-z] | Lowercase letter |
Compiling Patterns
For reuse, compile patterns:
pattern = re.compile(r"\d{3}-\d{4}")
# Use like functions
pattern.search(text)
pattern.findall(text)
pattern.sub("XXX-XXXX", text)

Groups
Capture parts of matches:
# Named groups
pattern = r"(?P<area>\d{3})-(?P<number>\d{4})"
match = re.search(pattern, "Call 555-1234")
match.group("area") # "555"
match.group("number") # "1234"
match.groups() # ("555", "1234")
match.groupdict() # {"area": "555", "number": "1234"}
# Numbered groups
pattern = r"(\d{3})-(\d{4})"
match = re.search(pattern, "Call 555-1234")
match.group(1) # "555"
match.group(2) # "1234"

Flags
Modify pattern behavior:
# Case insensitive
re.search(r"hello", "HELLO", re.IGNORECASE)
re.search(r"hello", "HELLO", re.I)
# Multiline (^ and $ match line boundaries)
re.findall(r"^\w+", "line1\nline2", re.MULTILINE)
# Dot matches newline
re.search(r"a.b", "a\nb", re.DOTALL)
# Verbose (allow comments)
pattern = re.compile(r"""
\d{3} # Area code
- # Separator
\d{4} # Number
""", re.VERBOSE)Common Patterns
email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
re.findall(email_pattern, text)

URL
url_pattern = r"https?://[^\s]+"
re.findall(url_pattern, text)

Phone number
phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
re.findall(phone_pattern, text)

IP address
ip_pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
re.findall(ip_pattern, text)

Substitution
# Simple replace
re.sub(r"\d+", "X", "abc123def456") # "abcXdefX"
# Using groups
re.sub(r"(\w+), (\w+)", r"\2 \1", "Doe, John") # "John Doe"
# With function
def double(match):
    """Return the digits captured by *match*, doubled, as a string."""
    number = int(match.group())
    return str(number * 2)
re.sub(r"\d+", double, "5 cats and 3 dogs") # "10 cats and 6 dogs"
# Limit replacements
re.sub(r"a", "X", "banana", count=2) # "bXnXna"Splitting
# Split on pattern
re.split(r"\s+", "hello world") # ["hello", "world"]
# Keep delimiters
re.split(r"(\s+)", "a b c") # ["a", " ", "b", " ", "c"]
# Limit splits
re.split(r",", "a,b,c,d", maxsplit=2) # ["a", "b", "c,d"]Lookahead and Lookbehind
Match without consuming:
# Positive lookahead (?=...)
re.findall(r"\w+(?=@)", "user@example.com") # ["user"]
# Negative lookahead (?!...)
re.findall(r"\d+(?!%)", "50% and 100") # ["100"]
# Positive lookbehind (?<=...)
re.findall(r"(?<=\$)\d+", "$50 and $100") # ["50", "100"]
# Negative lookbehind (?<!...)
re.findall(r"(?<!\$)\d+", "50 and $100") # ["50"]Non-Greedy Matching
text = "<tag>content</tag>"
# Greedy (default)
re.search(r"<.*>", text).group() # "<tag>content</tag>"
# Non-greedy
re.search(r"<.*?>", text).group() # "<tag>"Practical Examples
Parse log line
log = "2026-03-21 14:30:00 ERROR: Connection failed"
pattern = r"(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2}) (?P<level>\w+): (?P<message>.+)"
match = re.match(pattern, log)
match.groupdict()

Extract data from HTML
html = '<a href="https://example.com">Link</a>'
pattern = r'href="([^"]+)"'
re.findall(pattern, html) # ["https://example.com"]

Validate input
def is_valid_username(username):
pattern = r"^[a-zA-Z][a-zA-Z0-9_]{2,19}$"
return bool(re.match(pattern, username))Clean whitespace
def normalize_whitespace(text):
return re.sub(r"\s+", " ", text.strip())Quick Reference
import re
# Search
re.search(pattern, text) # First match
re.match(pattern, text) # Match at start
re.fullmatch(pattern, text) # Match entire string
# Find all
re.findall(pattern, text) # List of matches
re.finditer(pattern, text) # Iterator of Match objects
# Replace
re.sub(pattern, repl, text)
re.subn(pattern, repl, text) # Also returns count
# Split
re.split(pattern, text)
# Compile
pattern = re.compile(r"...")

Use raw strings (r"...") for patterns to avoid escape issues. For complex text processing, regex is your friend.
React to this post: