Python html Module: HTML Entity Handling

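The html module handles HTML entity encoding and decoding: html.escape() converts special characters to entities, and html.unescape() converts entities back to characters. Escaping is essential for preventing XSS attacks and safely embedding text in HTML.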

html.escape

Convert special characters to HTML entities:

import html
 
# Escape HTML special characters
text = '<script>alert("XSS")</script>'
safe = html.escape(text)
print(safe)
# &lt;script&gt;alert(&quot;XSS&quot;)&lt;/script&gt;
# (the double quotes become &quot; because quote=True is the default)

# Now safe to embed in HTML
html_output = f"<p>{safe}</p>"

What Gets Escaped

import html
 
# These characters are escaped:
html.escape('<')   # &lt;
html.escape('>')   # &gt;
html.escape('&')   # &amp;
html.escape('"')   # &quot; (because quote=True is the default)
html.escape("'")   # &#x27; (because quote=True is the default)

# Quotes ARE escaped by default; pass quote=False to leave ' and " unchanged
html.escape("'")           # "&#x27;"
html.escape("'", quote=True)  # "&#x27;" (explicit form of the default)

html.unescape

Convert entities back to characters:

import html
 
# Unescape named entities for the HTML special characters
html.unescape('&lt;div&gt;')  # '<div>'
html.unescape('&amp;')        # '&'
html.unescape('&quot;')       # '"'

# Numeric entities (decimal and hexadecimal character references)
html.unescape('&#60;')        # '<' (decimal)
html.unescape('&#x3C;')       # '<' (hex)

# Other named entities (symbols and special spaces)
html.unescape('&copy;')       # '©'
html.unescape('&euro;')       # '€'
html.unescape('&nbsp;')       # '\xa0' (non-breaking space)

Practical Examples

Safe Template Rendering

import html
 
def render_comment(user_input):
    """Wrap user input in a comment <div>, escaping it first.

    The input is passed through html.escape, so any markup in it is
    rendered as literal text rather than interpreted by the browser.
    """
    escaped = html.escape(user_input)
    # The first and last entries reproduce the leading newline and the
    # trailing indentation of the template.
    pieces = [
        '',
        '    <div class="comment">',
        f'        <p>{escaped}</p>',
        '    </div>',
        '    ',
    ]
    return '\n'.join(pieces)
 
# User tries XSS
malicious = '<img src=x onerror=alert("hacked")>'
print(render_comment(malicious))
# Prints (indentation omitted); the tag is inert text and the quotes become &quot;:
# <div class="comment">
#     <p>&lt;img src=x onerror=alert(&quot;hacked&quot;)&gt;</p>
# </div>

Safe Attribute Values

import html
 
def create_link(url, text):
    """Build an <a> element from an escaped URL and escaped link text.

    Both values go through html.escape, so quotes in the URL cannot end
    the href attribute. The URL is only escaped; its scheme is not checked.
    """
    href = html.escape(url, quote=True)
    label = html.escape(text)
    return '<a href="' + href + '">' + label + '</a>'
 
# Safe even with quotes in input: each " in the URL becomes &quot;,
# so it cannot close the href attribute and start a new one
link = create_link('http://example.com"onclick="alert(1)', 'Click me')
# <a href="http://example.com&quot;onclick=&quot;alert(1)">Click me</a>

Parse HTML Entities from APIs

import html
import json
 
def clean_api_response(data):
    """Recursively unescape HTML entities in a decoded API payload.

    Strings are unescaped, dict values and list items are processed
    recursively, and every other value is returned unchanged. Dict keys
    are kept as they are.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned[key] = clean_api_response(value)
        return cleaned
    if isinstance(data, list):
        return list(map(clean_api_response, data))
    if isinstance(data, str):
        return html.unescape(data)
    return data
 
# Sample payload whose string values contain HTML entities
response = {
    'title': 'Tom &amp; Jerry',
    'description': 'A &quot;classic&quot; cartoon'
}
cleaned = clean_api_response(response)
# Every string value is unescaped; the keys are left as they were:
# {'title': 'Tom & Jerry', 'description': 'A "classic" cartoon'}

Generate Safe HTML

import html
 
class HTMLBuilder:
    """Collect HTML fragments and join them into one string.

    Each add_* method returns the builder itself, so calls can be chained.
    """

    def __init__(self):
        # Fragments in insertion order; build() concatenates them as-is.
        self.parts = []

    def add_text(self, text):
        """Append text after escaping it with html.escape."""
        escaped = html.escape(text)
        self.parts.append(escaped)
        return self

    def add_raw(self, html_content):
        """Append html_content verbatim, without escaping (use with caution)."""
        self.parts.append(html_content)
        return self

    def add_element(self, tag, content, **attrs):
        """Append an element with escaped content and attribute values.

        Attribute values are converted with str() and escaped. The tag
        and the attribute names are inserted unchanged.
        """
        rendered = []
        for name, value in attrs.items():
            escaped_value = html.escape(str(value), quote=True)
            rendered.append(f'{name}="{escaped_value}"')
        attributes = ' '.join(rendered)
        body = html.escape(content)
        # Omit the separating space when there are no attributes.
        opening = f'<{tag} {attributes}>' if attributes else f'<{tag}>'
        self.parts.append(f'{opening}{body}</{tag}>')
        return self

    def build(self):
        """Return all collected fragments concatenated in order."""
        return ''.join(self.parts)
 
# Usage: element content is escaped, and fragments are joined with no separator
builder = HTMLBuilder()
builder.add_element('h1', 'Hello <World>')
builder.add_element('a', 'Click here', href='https://example.com')
print(builder.build())
# <h1>Hello &lt;World&gt;</h1><a href="https://example.com">Click here</a>

Convert Plain Text to HTML

import html
 
def text_to_html(text):
    """Turn plain text into HTML that keeps line breaks and doubled spaces.

    The text is escaped first, then each newline gets a <br> in front of
    it and each pair of spaces becomes two &nbsp; entities.
    """
    # Applied in this order, after escaping, so the inserted markup is
    # not itself escaped.
    substitutions = (
        ('\n', '<br>\n'),
        ('  ', '&nbsp;&nbsp;'),
    )
    result = html.escape(text)
    for plain, markup in substitutions:
        result = result.replace(plain, markup)
    return result
 
text = """Line 1
Line 2 with  extra spaces
<script>alert('xss')</script>"""

print(text_to_html(text))
# Line 1<br>
# Line 2 with&nbsp;&nbsp;extra spaces<br>
# &lt;script&gt;alert(&#x27;xss&#x27;)&lt;/script&gt;

html vs Other Escaping

import html
import urllib.parse
import json
 
text = '<script>"test"</script>'

# HTML escaping (for HTML content); quotes are escaped too by default
html.escape(text)
# '&lt;script&gt;&quot;test&quot;&lt;/script&gt;'

# URL escaping (for URLs); '/' is left alone because safe='/' is the default
urllib.parse.quote(text)
# '%3Cscript%3E%22test%22%3C/script%3E'

# JSON escaping (for JSON strings); shown as the repr of the returned str
json.dumps(text)
# '"<script>\\"test\\"</script>"'

Common Entities

import html
 
# Common named entities and the characters they decode to
html.unescape('&nbsp;')   # '\xa0' (non-breaking space)
html.unescape('&copy;')   # © (copyright sign)
html.unescape('&reg;')    # ® (registered sign)
html.unescape('&trade;')  # ™ (trade mark sign)
html.unescape('&mdash;')  # — (em dash)
html.unescape('&ndash;')  # – (en dash)
html.unescape('&hellip;') # … (horizontal ellipsis)
html.unescape('&euro;')   # € (euro sign)
html.unescape('&pound;')  # £ (pound sign)

Quick Reference

import html
 
# Escape: text → HTML safe (escapes <, >, & and, by default, ' and ")
html.escape(text)
html.escape(text, quote=True)  # Explicit form of the default: also escapes ' and "

# Unescape: entities → text
html.unescape(html_text)

Character	Escaped	Name
`<`	`&lt;`	Less than
`>`	`&gt;`	Greater than
`&`	`&amp;`	Ampersand
`"`	`&quot;`	Double quote
`'`	`&#x27;`	Apostrophe

Rule: Always html.escape() user input before embedding in HTML. No exceptions.

React to this post:

#html.escape

#What Gets Escaped

#html.unescape

#Practical Examples

#Safe Template Rendering

#Safe Attribute Values

#Parse HTML Entities from APIs

#Generate Safe HTML

#Convert Plain Text to HTML

#html vs Other Escaping

#Common Entities

#Quick Reference

Keep Reading

Need help shipping fast?