When I first learned about pickle, I thought it was magic. Serialize any Python object to bytes and back? Amazing! Then I learned it can execute arbitrary code on deserialization, and suddenly that magic felt more like a loaded gun. Here's everything I've learned about using pickle safely (and when not to use it at all).
The Basics: dump, dumps, load, loads
Pickle has four main functions. The "s" suffix means "string" (actually bytes):
import pickle
# dumps: serialize to bytes
data = {"user": "alice", "scores": [95, 87, 92]}
serialized = pickle.dumps(data)
print(type(serialized)) # <class 'bytes'>
print(serialized[:20]) # b'\x80\x05\x95&\x00\x00\x00...
# loads: deserialize from bytes
restored = pickle.loads(serialized)
print(restored) # {'user': 'alice', 'scores': [95, 87, 92]}
# dump: serialize to file
with open("data.pkl", "wb") as f: # Binary mode!
pickle.dump(data, f)
# load: deserialize from file
with open("data.pkl", "rb") as f:
loaded = pickle.load(f)

The "wb" and "rb" modes are crucial—pickle produces binary data, not text.
Protocol Versions
Pickle has evolved through several protocol versions:
import pickle
data = {"key": "value"}
# Protocol 0: ASCII, human-readable, slow (Python 1.x)
p0 = pickle.dumps(data, protocol=0)
print(p0) # b'(dp0\nVkey\np1\nVvalue\np2\ns.'
# Protocol 1: Binary, more efficient (Python 1.x)
p1 = pickle.dumps(data, protocol=1)
# Protocol 2: New-style classes (Python 2.3)
p2 = pickle.dumps(data, protocol=2)
# Protocol 3: Python 3 only, bytes support (Python 3.0)
p3 = pickle.dumps(data, protocol=3)
# Protocol 4: Large objects, more types (Python 3.4)
p4 = pickle.dumps(data, protocol=4)
# Protocol 5: Out-of-band buffers (Python 3.8)
p5 = pickle.dumps(data, protocol=5)
# Always use highest for your Python version
pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
# Check current default
print(pickle.DEFAULT_PROTOCOL) # 4 or 5 depending on Python version

Rule of thumb: Use protocol=pickle.HIGHEST_PROTOCOL unless you need compatibility with older Python versions.
Pickle vs JSON: When to Use Each
This confused me for a while. Here's the mental model I use:
| Aspect | JSON | Pickle |
|---|---|---|
| Safety | ✅ Safe with untrusted data | ❌ NEVER use with untrusted data |
| Speed | Slower | Faster |
| Types | Limited (dict, list, str, int, float, bool, null) | Any Python object |
| Portability | Cross-language | Python only |
| Human-readable | Yes | No (binary) |
| Long-term storage | Good | Bad (breaks with code changes) |
import json
import pickle
from datetime import datetime
data = {"name": "Alice", "created": datetime.now()}
# JSON: Can't handle datetime directly
try:
json.dumps(data)
except TypeError as e:
print(f"JSON error: {e}")
# Must convert: json.dumps(data, default=str)
# Pickle: Handles anything
pickle.dumps(data) # Works fine

Use JSON when:
- Data comes from external sources (APIs, users)
- You need cross-language compatibility
- Data will be stored long-term
- You want human-readable output
Use pickle when:
- Data is trusted (your own code, same machine)
- You need to serialize complex Python objects
- Speed matters and you control both ends
- Short-term caching only
⚠️ THE SECURITY WARNING ⚠️
This is the most important section. Never unpickle data from untrusted sources.
Here's why—this is a real exploit:
import pickle
import os
class MaliciousPayload:
    """Proof-of-concept exploit object: merely unpickling it runs a shell command."""
    def __reduce__(self):
        # pickle calls __reduce__ to learn how to rebuild the object; the
        # returned (callable, args) pair is invoked at load time, so
        # pickle.loads() ends up calling os.system(<attacker's command>).
        # This executes when unpickled!
        return (os.system, ("echo 'You have been pwned!' && whoami",))
# Create the payload
evil_bytes = pickle.dumps(MaliciousPayload())
# Somewhere else, an unsuspecting victim loads it...
pickle.loads(evil_bytes) # RUNS THE COMMAND

When I run this, it actually executes whoami on my system. An attacker could:
- Delete files: rm -rf /
- Install malware
- Exfiltrate data
- Create backdoors
- Anything the Python process can do
Real-world scenarios where this is dangerous:
# ❌ NEVER DO THIS
# Loading pickle from a web request
@app.route("/import", methods=["POST"])
def import_data():
    # request.data is fully attacker-controlled: unpickling it is
    # remote code execution, as shown by MaliciousPayload above.
    data = pickle.loads(request.data) # VULNERABLE!
    return "OK"
# Loading pickle from a database that could be compromised
def load_user_session(session_id):
    # Anyone who can write to Redis can plant a payload that runs
    # arbitrary code in this process when the session is loaded.
    session_data = redis.get(f"session:{session_id}")
    return pickle.loads(session_data) # VULNERABLE if Redis is compromised
# Loading pickle from a file upload
def process_upload(file):
    # User-supplied file contents, same problem as above.
    return pickle.load(file) # VULNERABLE!
# Loading pickle from a message queue
def handle_message(msg):
task = pickle.loads(msg.body) # VULNERABLE!

__reduce__ and Custom Pickling
The __reduce__ method controls how an object is pickled and unpickled:
import pickle
class DatabaseConnection:
    """Toy connection object demonstrating custom pickling via __reduce__.

    Only (host, port) are serialized; unpickling calls the class again,
    which re-runs __init__ and re-opens the (simulated) socket.
    """

    def __init__(self, host, port):
        self.host = host
        self.port = port
        # A real implementation would open a network socket here.
        self.socket = self._connect()

    def _connect(self):
        # Stand-in for establishing an actual connection.
        return f"socket({self.host}:{self.port})"

    def __reduce__(self):
        # (callable, args): pickle invokes callable(*args) on load,
        # so the restored object gets a freshly "opened" socket.
        return (type(self), (self.host, self.port))
conn = DatabaseConnection("localhost", 5432)
print(conn.socket) # socket(localhost:5432)
# Pickle and unpickle
restored = pickle.loads(pickle.dumps(conn))
print(restored.socket) # socket(localhost:5432) - reconnected!

For more control, use __getstate__ and __setstate__:
import pickle
class CachedResource:
    """Resource with transient state (cache + connection) excluded from pickling.

    __getstate__ strips the transient attributes before serialization;
    __setstate__ rebuilds them, so an unpickled instance starts with an
    empty cache and a fresh connection.
    """

    def __init__(self, name):
        self.name = name
        self.cache = {}  # transient: never serialized
        self.connection = self._connect()  # transient: not picklable in real life

    def _connect(self):
        # Stand-in for acquiring a live resource handle.
        return f"connected:{self.name}"

    def __getstate__(self):
        # Persist everything except the transient attributes.
        return {
            key: value
            for key, value in self.__dict__.items()
            if key not in ("cache", "connection")
        }

    def __setstate__(self, state):
        self.__dict__.update(state)
        # Recreate the transient attributes from scratch.
        self.cache = {}
        self.connection = self._connect()
resource = CachedResource("mydb")
resource.cache["key"] = "value"
restored = pickle.loads(pickle.dumps(resource))
print(restored.name) # mydb
print(restored.cache) # {} (fresh)
print(restored.connection) # connected:mydb (reconnected)

Common Patterns
Disk Caching
import pickle
from pathlib import Path
from functools import wraps
def disk_cache(cache_path):
    """Decorator: memoize a function's result to a pickle file on disk.

    NOTE: the cache key is the file path alone — every argument
    combination shares the single cached result. Only use on functions
    whose output does not depend on their arguments, or vary the path.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            cache_file = Path(cache_path)
            if cache_file.exists():
                try:
                    return pickle.loads(cache_file.read_bytes())
                except Exception:
                    # Unreadable/corrupted cache: fall through and recompute.
                    pass
            result = func(*args, **kwargs)
            cache_file.write_bytes(
                pickle.dumps(result, protocol=pickle.HIGHEST_PROTOCOL)
            )
            return result
        return wrapper
    return decorator
@disk_cache("expensive_result.pkl")
def expensive_computation():
# This only runs once, result is cached
import time
time.sleep(2)
return {"computed": True}

Session Storage
import pickle
import hashlib
import hmac
from pathlib import Path
SECRET_KEY = b"your-secret-key-here"  # In production, load from an env var / secret store

def sign_data(data: bytes) -> bytes:
    """Prefix *data* with its HMAC-SHA256 signature (32 bytes)."""
    mac = hmac.new(SECRET_KEY, data, hashlib.sha256).digest()
    return mac + data

def verify_and_load(signed_data: bytes):
    """Check the 32-byte HMAC prefix, then unpickle the payload.

    Raises ValueError when the signature does not match, i.e. the
    payload was tampered with (or signed with a different key).
    """
    mac, payload = signed_data[:32], signed_data[32:]
    expected = hmac.new(SECRET_KEY, payload, hashlib.sha256).digest()
    # compare_digest is constant-time, defeating timing attacks.
    if not hmac.compare_digest(mac, expected):
        raise ValueError("Invalid signature - data may be tampered!")
    return pickle.loads(payload)
# Usage
session = {"user_id": 123, "permissions": ["read", "write"]}
signed = sign_data(pickle.dumps(session))
# Later...
restored = verify_and_load(signed)

Note: This only protects against tampering, not against malicious data created by someone who knows your secret key.
ML Model Persistence
import pickle
from pathlib import Path
def save_model(model, path, metadata=None):
    """Pickle *model* to *path*, wrapped with metadata and a format version.

    The payload is a dict {"model", "metadata", "version"} so the loader
    can evolve without breaking old files.
    """
    payload = {
        "model": model,
        "metadata": metadata or {},  # empty dict when no metadata given
        "version": "1.0",
    }
    blob = pickle.dumps(payload, protocol=pickle.HIGHEST_PROTOCOL)
    Path(path).write_bytes(blob)
def load_model(path):
    """Inverse of save_model: return (model, metadata) read from *path*.

    Trusted local files only — this unpickles the file contents.
    """
    payload = pickle.loads(Path(path).read_bytes())
    return payload["model"], payload["metadata"]
# Usage with sklearn
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
# ... train model ...
save_model(model, "model.pkl", {"trained_on": "2024-01-15", "accuracy": 0.95})
loaded_model, meta = load_model("model.pkl")

Alternatives to pickle
JSON (Safe, Portable)
import json
from dataclasses import dataclass, asdict
@dataclass
class User:
    """Minimal record type used to demonstrate JSON round-tripping via asdict()."""
    name: str
    age: int
# Works for simple data
user = User("Alice", 30)
json_str = json.dumps(asdict(user))
restored = User(**json.loads(json_str))

msgpack (Fast, Compact)
# pip install msgpack
import msgpack
data = {"name": "Alice", "scores": [1, 2, 3]}
# Serialize (smaller than JSON, faster than pickle for simple data)
packed = msgpack.packb(data)
print(len(packed)) # ~25 bytes
# Deserialize
restored = msgpack.unpackb(packed)

cloudpickle (Extended Pickle)
# pip install cloudpickle
import cloudpickle
# Can pickle lambdas and closures!
fn = lambda x: x * 2
serialized = cloudpickle.dumps(fn)
restored_fn = cloudpickle.loads(serialized)
print(restored_fn(21)) # 42
# Great for distributed computing (sending functions to workers)

dill (Even More Extended)
# pip install dill
import dill
# Pickles almost anything
def outer():
x = 10
def inner(y):
return x + y
return inner
fn = outer()
serialized = dill.dumps(fn)
restored = dill.loads(serialized)
print(restored(5)) # 15

joblib (Optimized for NumPy)
# pip install joblib
import joblib
import numpy as np
# Much faster for large numpy arrays
arr = np.random.rand(10000, 1000)
joblib.dump(arr, "array.joblib", compress=3)
restored = joblib.load("array.joblib")

Restricted Unpickling
If you must unpickle potentially risky data, restrict what can be loaded:
import pickle
import io
class RestrictedUnpickler(pickle.Unpickler):
    """Unpickler that rejects every class not on an explicit builtins whitelist."""

    # (module, name) pairs that find_class is allowed to resolve.
    SAFE_CLASSES = {
        ("builtins", "dict"),
        ("builtins", "list"),
        ("builtins", "tuple"),
        ("builtins", "set"),
        ("builtins", "frozenset"),
        ("builtins", "str"),
        ("builtins", "bytes"),
        ("builtins", "int"),
        ("builtins", "float"),
        ("builtins", "bool"),
        ("builtins", "type"),  # kept from original whitelist (noted "For None")
    }

    def find_class(self, module, name):
        # Guard clause: anything outside the whitelist is refused outright,
        # which blocks os.system, getattr, and every other gadget.
        if (module, name) not in self.SAFE_CLASSES:
            raise pickle.UnpicklingError(
                f"Forbidden: {module}.{name}"
            )
        return super().find_class(module, name)

def safe_loads(data: bytes):
    """Like pickle.loads, but restricted to the whitelisted builtin types."""
    return RestrictedUnpickler(io.BytesIO(data)).load()
# This works
safe_data = pickle.dumps({"key": [1, 2, 3]})
print(safe_loads(safe_data)) # {'key': [1, 2, 3]}
# This raises an error
import os
evil = pickle.dumps(os)
try:
safe_loads(evil)
except pickle.UnpicklingError as e:
print(f"Blocked: {e}") # Blocked: Forbidden: builtins.getattr

Best Practices Summary
- Never unpickle untrusted data - This can't be stressed enough
- Use protocol=pickle.HIGHEST_PROTOCOL - Better performance and features
- Use binary file modes - Always "wb" and "rb"
- Sign pickled data - If storing/transmitting, add HMAC signature
- Consider alternatives - JSON for external data, msgpack for performance
- Don't use pickle for long-term storage - Code changes break pickled objects
- Document pickle versions - Note Python version in metadata
# My typical pickle usage
import pickle
from pathlib import Path
def cache_result(key: str, data):
    """Pickle *data* under .cache/<key>.pkl — safe local caching only."""
    target = Path(f".cache/{key}.pkl")
    # Create the cache directory on first use.
    target.parent.mkdir(exist_ok=True)
    target.write_bytes(pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL))
def load_cached(key: str):
    """Return the value cached under *key*, or None when no cache file exists."""
    source = Path(f".cache/{key}.pkl")
    if source.exists():
        # Trusted local file written by cache_result — safe to unpickle.
        return pickle.loads(source.read_bytes())
return None

Pickle is powerful but dangerous. Use it for local caching and trusted inter-process communication. For everything else, reach for JSON or a typed serialization format.