The html module handles HTML entity encoding. Essential for preventing XSS attacks and safely embedding text in HTML.
html.escape
Convert special characters to HTML entities:
import html
# Escape HTML special characters
text = '<script>alert("XSS")</script>'
safe = html.escape(text)
print(safe)
# &lt;script&gt;alert(&quot;XSS&quot;)&lt;/script&gt;
# Now safe to embed in HTML
html_output = f"<p>{safe}</p>"

What Gets Escaped
import html
# These characters are escaped:
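html.escape('<') # '&lt;'
html.escape('>') # '&gt;'
html.escape('&') # '&amp;'
html.escape('"') # '&quot;' (only when quote=True, the default)
html.escape("'") # '&#x27;' (only when quote=True, the default)
# Quotes ARE escaped by default (quote=True); pass quote=False to leave them unchanged
html.escape("'") # '&#x27;'
html.escape("'", quote=True) # '&#x27;' (same as the default)

html.unescape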
Convert entities back to characters:
import html
# Unescape named entities
html.unescape('&lt;div&gt;') # '<div>'
html.unescape('&amp;') # '&'
html.unescape('&quot;') # '"'
# Numeric entities
html.unescape('&#60;') # '<'
html.unescape('&#x3c;') # '<' (hex)
# Named entities
html.unescape('&copy;') # '©'
html.unescape('&euro;') # '€'
html.unescape('&nbsp;') # '\xa0' (non-breaking space)

Practical Examples
Safe Template Rendering
import html
def render_comment(user_input):
    """Return an HTML comment block with ``user_input`` safely escaped.

    Args:
        user_input: Untrusted text (a ``str``) to show inside the comment.

    Returns:
        A ``<div class="comment">`` fragment, with a leading and a trailing
        newline, whose ``<p>`` body is the escaped text.
    """
    # Escape before interpolating so that markup in the input is displayed
    # as text instead of being interpreted by the browser.
    safe_text = html.escape(user_input)
    # Assemble the fragment from explicit lines rather than a triple-quoted
    # literal, so this function's own indentation never leaks into the output.
    return (
        "\n"
        '<div class="comment">\n'
        f"<p>{safe_text}</p>\n"
        "</div>\n"
    )
# User tries XSS
malicious = '<img src=x onerror=alert("hacked")>'
print(render_comment(malicious))
# <div class="comment">
# <p>&lt;img src=x onerror=alert(&quot;hacked&quot;)&gt;</p>
# </div>

Safe Attribute Values
import html
def create_link(url, text, allowed_schemes=("http", "https", "mailto")):
    """Build an ``<a>`` element from untrusted ``url`` and ``text``.

    Escaping stops the values from breaking out of the attribute or the
    element body, but it does not stop a ``javascript:`` (or similar) URL
    from running when clicked. URLs with an explicit scheme are therefore
    checked against ``allowed_schemes``.

    Args:
        url: Link target. Relative and scheme-less URLs are always accepted.
        text: Link label; rendered as escaped text.
        allowed_schemes: Lowercase URL schemes that may appear in ``href``.

    Returns:
        The ``<a href="...">...</a>`` markup. If ``url`` uses a scheme that
        is not allowed, the ``href`` is replaced with ``#``.
    """
    # Browsers ignore tabs/newlines inside a URL and leading spaces/control
    # characters, so remove them before looking for the scheme; otherwise
    # "java\tscript:" would slip past the check. The probe is used only for
    # detection; the original URL is what gets emitted.
    probe = "".join(ch for ch in url if ch > " " and ch != "\x7f")
    head, colon, _ = probe.partition(":")
    has_scheme = (
        bool(colon)
        and head[:1].isalpha()
        and all(ch.isalnum() or ch in "+-." for ch in head)
    )
    if has_scheme and head.lower() not in allowed_schemes:
        url = "#"

    # quote=True (the default, spelled out here) also escapes " and ', which
    # is what keeps the value inside the double-quoted attribute.
    safe_url = html.escape(url, quote=True)
    safe_text = html.escape(text)
    return f'<a href="{safe_url}">{safe_text}</a>'
# Safe even with quotes in input
link = create_link('http://example.com"onclick="alert(1)', 'Click me')
# <a href="http://example.com&quot;onclick=&quot;alert(1)">Click me</a>

Parse HTML Entities from APIs
import html
import json
def clean_api_response(data):
    """Recursively unescape HTML entities in a decoded API payload.

    Args:
        data: A string, dict, list, or any other value.

    Returns:
        A new structure of the same shape: strings are passed through
        ``html.unescape``, dict values and list items are processed
        recursively, and dict keys are left untouched. Any other value is
        returned unchanged.

    Note:
        The result is plain, unescaped text; escape it again with
        ``html.escape`` before embedding it in HTML.
    """
    if isinstance(data, str):
        return html.unescape(data)
    if isinstance(data, dict):
        return {key: clean_api_response(value) for key, value in data.items()}
    if isinstance(data, list):
        return [clean_api_response(item) for item in data]
    # Numbers, booleans, None, etc. carry no entities.
    return data
response = {
'title': 'Tom &amp; Jerry',
'description': 'A &quot;classic&quot; cartoon'
}
cleaned = clean_api_response(response)
# {'title': 'Tom & Jerry', 'description': 'A "classic" cartoon'}

Generate Safe HTML
import html
class HTMLBuilder:
    """Incrementally build an HTML fragment, escaping text by default.

    Every ``add_*`` method returns the builder so calls can be chained;
    ``build()`` joins the accumulated parts into one string.
    """

    # Characters allowed in tag/attribute names besides ASCII letters/digits.
    _NAME_EXTRAS = "-_:."

    def __init__(self):
        # Already-rendered HTML fragments, in insertion order.
        self.parts = []

    @classmethod
    def _check_name(cls, name, kind):
        """Raise ``ValueError`` unless ``name`` is a safe tag/attribute name.

        Names are written into the markup verbatim (they cannot be escaped),
        so anything containing spaces, quotes, ``<``, ``>``, ``=`` and the
        like would let a caller-supplied name inject markup.
        """
        candidate = str(name)
        is_safe = (
            bool(candidate)
            and candidate.isascii()
            and all(ch.isalnum() or ch in cls._NAME_EXTRAS for ch in candidate)
        )
        if not is_safe:
            raise ValueError(f"unsafe HTML {kind} name: {name!r}")

    def add_text(self, text):
        """Add escaped text."""
        self.parts.append(html.escape(text))
        return self

    def add_raw(self, html_content):
        """Add raw HTML verbatim (use with caution: nothing is escaped)."""
        self.parts.append(html_content)
        return self

    def add_element(self, tag, content, **attrs):
        """Add ``<tag ...>content</tag>`` with escaped content and attributes.

        Args:
            tag: Element name, e.g. ``'h1'``.
            content: Text content; escaped before insertion.
            **attrs: Attribute names mapped to values. Values are converted
                with ``str()`` and escaped, including quotes.

        Returns:
            The builder itself, for chaining.

        Raises:
            ValueError: If ``tag`` or an attribute name contains characters
                other than ASCII letters, digits, ``-``, ``_``, ``:``, ``.``.
                Nothing is appended in that case.
        """
        # Validate and render everything first so a rejected call leaves
        # the builder unchanged.
        self._check_name(tag, "tag")
        rendered_attrs = []
        for attr_name, attr_value in attrs.items():
            self._check_name(attr_name, "attribute")
            escaped_value = html.escape(str(attr_value), quote=True)
            rendered_attrs.append(f'{attr_name}="{escaped_value}"')

        opening = " ".join([tag, *rendered_attrs])
        self.parts.append(f"<{opening}>{html.escape(content)}</{tag}>")
        return self

    def build(self):
        """Return all added parts concatenated into one string."""
        return ''.join(self.parts)
# Usage
builder = HTMLBuilder()
builder.add_element('h1', 'Hello <World>')
builder.add_element('a', 'Click here', href='https://example.com')
print(builder.build())
# <h1>Hello &lt;World&gt;</h1><a href="https://example.com">Click here</a>

Convert Plain Text to HTML
import html
def text_to_html(text):
    """Convert plain text to HTML, preserving line breaks and space runs.

    Args:
        text: Plain (possibly untrusted) text.

    Returns:
        The escaped text with each newline turned into ``<br>`` followed by
        a newline, and each pair of consecutive spaces turned into a space
        plus ``&nbsp;`` so browsers do not collapse them.
    """
    # Escape first: the <br> and &nbsp; inserted below must not be escaped.
    safe = html.escape(text)
    # Convert newlines to <br>, keeping the newline for readable source.
    safe = safe.replace('\n', '<br>\n')
    # Preserve multiple spaces. Replacing a space with a space would be a
    # no-op; pairing a normal space with &nbsp; keeps the run visible while
    # still allowing the line to wrap.
    safe = safe.replace('  ', ' &nbsp;')
    return safe
text = """Line 1
Line 2 with extra spaces
<script>alert('xss')</script>"""
print(text_to_html(text))

html vs Other Escaping
import html
import urllib.parse
import json
text = '<script>"test"</script>'
# HTML escaping (for HTML content)
html.escape(text)
# '&lt;script&gt;&quot;test&quot;&lt;/script&gt;'
# URL escaping (for URLs)
urllib.parse.quote(text)
# '%3Cscript%3E%22test%22%3C/script%3E' ('/' is kept by default; pass safe='' to get %2F)
# JSON escaping (for JSON strings)
json.dumps(text)
# '"<script>\\"test\\"</script>"'

Common Entities
import html
# Common named entities
html.unescape('&nbsp;') # Non-breaking space
html.unescape('&copy;') # ©
html.unescape('&reg;') # ®
html.unescape('&trade;') # ™
html.unescape('&mdash;') # —
html.unescape('&ndash;') # –
html.unescape('&hellip;') # …
html.unescape('&euro;') # €
html.unescape('&pound;') # £

Quick Reference
import html
# Escape: text → HTML safe
html.escape(text)
html.escape(text, quote=True) # Same as the default: also escapes ' and "
# Unescape: entities → text
html.unescape(html_text)

| Character | Escaped | Named Entity |
|---|---|---|
| < | &lt; | Less than |
| > | &gt; | Greater than |
| & | &amp; | Ampersand |
| " | &quot; | Quote |
| ' | &#x27; | Apostrophe |
Rule: Always html.escape() user input before embedding in HTML. No exceptions.
React to this post: