Regular expressions are powerful for text processing. Here's how to use Python's re module effectively.
Basic Functions
import re
text = "Hello, World! Hello, Python!"
# Search for pattern (first match)
match = re.search(r"Hello", text)
if match:
    print(match.group()) # "Hello"
# Match at start only
match = re.match(r"Hello", text)
# Find all matches
matches = re.findall(r"Hello", text) # ["Hello", "Hello"]
# Find with match objects
for match in re.finditer(r"Hello", text):
    print(match.start(), match.end())
# Replace
new_text = re.sub(r"Hello", "Hi", text) # "Hi, World! Hi, Python!"

Pattern Syntax
Basic Patterns
| Pattern | Meaning |
|---|---|
| . | Any character (except newline) |
| ^ | Start of string |
| $ | End of string |
| * | 0 or more |
| + | 1 or more |
| ? | 0 or 1 |
| {n} | Exactly n |
| {n,m} | Between n and m |
Character Classes
| Pattern | Meaning |
|---|---|
| \d | Digit [0-9] |
| \D | Non-digit |
| \w | Word char [a-zA-Z0-9_] |
| \W | Non-word char |
| \s | Whitespace |
| \S | Non-whitespace |
| [abc] | a, b, or c |
| [^abc] | Not a, b, or c |
| [a-z] | Lowercase letter |
Compiling Patterns
For reuse, compile patterns:
pattern = re.compile(r"\d{3}-\d{4}")
# Use like functions
pattern.search(text)
pattern.findall(text)
pattern.sub("XXX-XXXX", text)

Groups
Capture parts of matches:
# Named groups
pattern = r"(?P<area>\d{3})-(?P<number>\d{4})"
match = re.search(pattern, "Call 555-1234")
match.group("area") # "555"
match.group("number") # "1234"
match.groups() # ("555", "1234")
match.groupdict() # {"area": "555", "number": "1234"}
# Numbered groups
pattern = r"(\d{3})-(\d{4})"
match = re.search(pattern, "Call 555-1234")
match.group(1) # "555"
match.group(2) # "1234"

Flags
Modify pattern behavior:
# Case insensitive
re.search(r"hello", "HELLO", re.IGNORECASE)
re.search(r"hello", "HELLO", re.I)
# Multiline (^ and $ match line boundaries)
re.findall(r"^\w+", "line1\nline2", re.MULTILINE)
# Dot matches newline
re.search(r"a.b", "a\nb", re.DOTALL)
# Verbose (allow comments)
pattern = re.compile(r"""
\d{3} # Area code
- # Separator
\d{4} # Number
""", re.VERBOSE)Common Patterns
email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
re.findall(email_pattern, text)

URL
url_pattern = r"https?://[^\s]+"
re.findall(url_pattern, text)

Phone number
phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
re.findall(phone_pattern, text)

IP address
ip_pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
re.findall(ip_pattern, text)

Substitution
# Simple replace
re.sub(r"\d+", "X", "abc123def456") # "abcXdefX"
# Using groups
re.sub(r"(\w+), (\w+)", r"\2 \1", "Doe, John") # "John Doe"
# With function
def double(match):
    """Return the digits captured by *match*, doubled, as a string."""
    number = int(match.group())
    return str(number * 2)
re.sub(r"\d+", double, "5 cats and 3 dogs") # "10 cats and 6 dogs"
# Limit replacements
re.sub(r"a", "X", "banana", count=2) # "bXnXna"Splitting
# Split on pattern
re.split(r"\s+", "hello world") # ["hello", "world"]
# Keep delimiters
re.split(r"(\s+)", "a b c") # ["a", " ", "b", " ", "c"]
# Limit splits
re.split(r",", "a,b,c,d", maxsplit=2) # ["a", "b", "c,d"]Lookahead and Lookbehind
Match without consuming:
# Positive lookahead (?=...)
re.findall(r"\w+(?=@)", "user@example.com") # ["user"]
# Negative lookahead (?!...)
re.findall(r"\d+(?!%)", "50% and 100") # ["100"]
# Positive lookbehind (?<=...)
re.findall(r"(?<=\$)\d+", "$50 and $100") # ["50", "100"]
# Negative lookbehind (?<!...)
re.findall(r"(?<!\$)\d+", "50 and $100") # ["50"]Non-Greedy Matching
text = "<tag>content</tag>"
# Greedy (default)
re.search(r"<.*>", text).group() # "<tag>content</tag>"
# Non-greedy
re.search(r"<.*?>", text).group() # "<tag>"Practical Examples
Parse log line
log = "2026-03-21 14:30:00 ERROR: Connection failed"
pattern = r"(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2}) (?P<level>\w+): (?P<message>.+)"
match = re.match(pattern, log)
match.groupdict()

Extract data from HTML
html = '<a href="https://example.com">Link</a>'
pattern = r'href="([^"]+)"'
re.findall(pattern, html) # ["https://example.com"]

Validate input
def is_valid_username(username):
pattern = r"^[a-zA-Z][a-zA-Z0-9_]{2,19}$"
return bool(re.match(pattern, username))Clean whitespace
def normalize_whitespace(text):
return re.sub(r"\s+", " ", text.strip())Quick Reference
import re
# Search
re.search(pattern, text) # First match
re.match(pattern, text) # Match at start
re.fullmatch(pattern, text) # Match entire string
# Find all
re.findall(pattern, text) # List of matches
re.finditer(pattern, text) # Iterator of Match objects
# Replace
re.sub(pattern, repl, text)
re.subn(pattern, repl, text) # Also returns count
# Split
re.split(pattern, text)
# Compile
pattern = re.compile(r"...")

Use raw strings (r"...") for patterns to avoid escape issues. For complex text processing, regex is your friend.
React to this post: