pickle serializes arbitrary Python objects to bytes. Powerful but dangerous—here's what you need to know.
Basic Usage
import pickle
# Serialize (dump): dumps() returns the pickled object as bytes
data = {"name": "Alice", "scores": [95, 87, 92]}
serialized = pickle.dumps(data)
# Deserialize (load): loads() rebuilds the object from those bytes
restored = pickle.loads(serialized)
print(restored) # {'name': 'Alice', 'scores': [95, 87, 92]}

# File Operations
import pickle
data = {"key": "value", "numbers": [1, 2, 3]}

# Write to file (pickle output is bytes, so open the file in binary mode)
with open("data.pkl", "wb") as f:
    pickle.dump(data, f)

# Read from file (binary mode again)
with open("data.pkl", "rb") as f:
    loaded = pickle.load(f)

# Protocols
import pickle
data = {"test": True}
# Protocol 0: ASCII, human-readable (slow)
pickle.dumps(data, protocol=0)
# Protocol 4: Python 3.4+ (default in 3.8+)
pickle.dumps(data, protocol=4)
# Protocol 5: Python 3.8+ (out-of-band data)
pickle.dumps(data, protocol=5)
# Highest available protocol
pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
# Check default protocol
print(pickle.DEFAULT_PROTOCOL) # 4 or 5

# What Can Be Pickled
import pickle
# These work:
pickle.dumps(None)
pickle.dumps(True)
pickle.dumps(42)
pickle.dumps(3.14)
pickle.dumps("hello")
pickle.dumps(b"bytes")
pickle.dumps([1, 2, 3])
pickle.dumps({"a": 1})
pickle.dumps((1, 2))
pickle.dumps({1, 2, 3})


# Classes defined at module level
class Point:
    """Plain 2-D point; picklable because the class lives at module level."""

    def __init__(self, x, y):
        self.x = x
        self.y = y


pickle.dumps(Point(1, 2))  # Works

# These don't work:
# pickle.dumps(lambda x: x)  # Can't pickle lambdas
# pickle.dumps(open("file.txt"))  # Can't pickle file handles

# Custom Pickling
import pickle
class Connection:
    """Connection settings plus a live socket that is left out of pickles.

    Only ``host`` and ``port`` are serialized; ``socket`` is reset to
    ``None`` on unpickling and has to be re-established by the caller.
    """

    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.socket = None  # Can't pickle sockets

    def __getstate__(self):
        """Return the instance state to pickle, without the socket."""
        state = self.__dict__.copy()
        # pop() rather than del: an instance that never got a ``socket``
        # attribute must still be picklable instead of raising KeyError.
        state.pop("socket", None)
        return state

    def __setstate__(self, state):
        """Restore pickled state and mark the connection as not yet open."""
        self.__dict__.update(state)
        self.socket = None  # Reconnect later


conn = Connection("localhost", 8080)
data = pickle.dumps(conn)
restored = pickle.loads(data)

# Reduce Protocol
import pickle
class DatabaseConnection:
    """Connection that is rebuilt from its URL when unpickled.

    ``__reduce__`` tells pickle to call ``DatabaseConnection(url)`` again,
    so ``__init__`` (and therefore ``_connect``) runs on load instead of the
    live connection object being serialized.
    """

    def __init__(self, url):
        self.url = url
        self._connect()

    def _connect(self):
        # Simulate connection
        self.connection = f"Connected to {self.url}"

    def __reduce__(self):
        # Return (callable, args) to reconstruct
        return (self.__class__, (self.url,))


conn = DatabaseConnection("postgres://localhost/db")
restored = pickle.loads(pickle.dumps(conn))
print(restored.connection)  # Reconnected

# ⚠️ Security Warning
import pickle
# NEVER unpickle untrusted data!
# Pickle can execute arbitrary code:
class Evil:
    """Demonstration payload: unpickling an instance runs a shell command."""

    def __reduce__(self):
        import os

        # On load, the unpickler calls os.system("echo PWNED") to "rebuild"
        # the object -- any callable and arguments could be put here.
        return (os.system, ("echo PWNED",))


# This would run the command:
# pickle.loads(pickle.dumps(Evil()))
# Safe alternatives for untrusted data:
# - JSON (json module)
# - MessagePack (msgpack)
# - Protocol Buffers

# Restricting Unpickling
import pickle
import io
class RestrictedUnpickler(pickle.Unpickler):
    """Unpickler that resolves only an explicit allow-list of globals.

    Any pickle that references a ``(module, name)`` pair outside
    ``ALLOWED_CLASSES`` is rejected with ``pickle.UnpicklingError``.
    """

    # (module, name) pairs that find_class is allowed to resolve.
    ALLOWED_CLASSES = {
        ("builtins", "dict"),
        ("builtins", "list"),
        ("builtins", "set"),
        ("builtins", "tuple"),
    }

    def find_class(self, module, name):
        """Resolve an allow-listed global or raise ``UnpicklingError``."""
        if (module, name) in self.ALLOWED_CLASSES:
            return super().find_class(module, name)
        raise pickle.UnpicklingError(
            f"Class {module}.{name} not allowed"
        )
def safe_loads(data):
    """Unpickle the bytes in *data* through ``RestrictedUnpickler``.

    Raises ``pickle.UnpicklingError`` if the pickle references a class that
    the restricted unpickler does not allow.
    """
    return RestrictedUnpickler(io.BytesIO(data)).load()


# Only allows basic types
safe_loads(pickle.dumps([1, 2, 3]))  # OK
# safe_loads(pickle.dumps(SomeClass()))  # Raises error

# Pickling with Slots
import pickle
class Point:
    """2-D point stored in ``__slots__`` with explicit pickle state hooks."""

    __slots__ = ["x", "y"]

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getstate__(self):
        """Return the slot values as a dict (there is no ``__dict__``)."""
        return {"x": self.x, "y": self.y}

    def __setstate__(self, state):
        """Assign the slot values back from the pickled dict."""
        self.x = state["x"]
        self.y = state["y"]


p = Point(1, 2)
restored = pickle.loads(pickle.dumps(p))

# Persistent References
import pickle
import io
class Database:
    """Minimal in-memory object store keyed by object id."""

    def __init__(self):
        self.objects = {}

    def store(self, obj_id, obj):
        """Save *obj* under *obj_id*, replacing any previous entry."""
        self.objects[obj_id] = obj

    def fetch(self, obj_id):
        """Return the object stored under *obj_id*; KeyError if unknown."""
        return self.objects[obj_id]
class DatabasePickler(pickle.Pickler):
    """Pickler that stores database-backed objects as references.

    Any object exposing a ``db_id`` attribute is written as the persistent
    id ``("db", obj.db_id)`` instead of being pickled by value.
    """

    def __init__(self, file, db):
        super().__init__(file)
        self.db = db

    def persistent_id(self, obj):
        """Return ``("db", db_id)`` for database objects, else ``None``."""
        if hasattr(obj, "db_id"):
            return ("db", obj.db_id)
        # None tells pickle to serialize the object normally.
        return None
class DatabaseUnpickler(pickle.Unpickler):
    """Unpickler that resolves ``("db", obj_id)`` references via ``db.fetch``."""

    def __init__(self, file, db):
        super().__init__(file)
        self.db = db

    def persistent_load(self, pid):
        """Return the object a persistent id refers to.

        Raises ``pickle.UnpicklingError`` for any id that is not a
        ``("db", obj_id)`` sequence.
        """
        # Check the shape first so a malformed pid (int, empty tuple, ...)
        # is reported as UnpicklingError rather than TypeError/IndexError.
        if isinstance(pid, (tuple, list)) and len(pid) >= 2 and pid[0] == "db":
            return self.db.fetch(pid[1])
        raise pickle.UnpicklingError(f"Unknown persistent id: {pid}")


# Pickling Large Objects
import pickle
# For large numpy arrays, use protocol 5
import numpy as np

arr = np.zeros((10000, 10000))

# Protocol 5 with out-of-band buffers: the array's memory is handed to
# buffer_callback instead of being copied into the pickle byte stream
buffers = []
data = pickle.dumps(arr, protocol=5, buffer_callback=buffers.append)

# Restore (the same buffers must be supplied, in the same order)
restored = pickle.loads(data, buffers=buffers)

# Common Patterns
import pickle
from pathlib import Path
def save_object(obj, path):
    """Save object to file.

    Pickles *obj* with the highest available protocol and writes the bytes
    to *path* (a ``str`` or ``Path``), overwriting any existing file.
    """
    Path(path).write_bytes(pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL))
def load_object(path):
    """Load object from file.

    Reads *path* (a ``str`` or ``Path``) and returns the unpickled object.

    SECURITY: unpickling can execute arbitrary code, so only call this on
    files that come from a trusted source.
    """
    return pickle.loads(Path(path).read_bytes())
def clone(obj):
    """Deep copy via pickle.

    Returns a new object built from a pickle round trip of *obj*; *obj* must
    therefore be picklable.
    """
    return pickle.loads(pickle.dumps(obj))
# Cache decorator
def disk_cache(path):
    """Return a decorator that caches a function's result in the file *path*.

    The first call runs the function and pickles its result to *path*; later
    calls return the unpickled file contents without running the function.

    NOTE: the cache is keyed on *path* only -- call arguments are ignored, so
    every call returns whatever the first call stored until the file is
    deleted.
    """
    import functools  # local import keeps this snippet self-contained

    def decorator(func):
        # wraps() preserves the decorated function's name and docstring.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            cache_file = Path(path)
            if cache_file.exists():
                return load_object(cache_file)
            result = func(*args, **kwargs)
            save_object(result, cache_file)
            return result

        return wrapper

    return decorator


# Alternatives to Pickle
# JSON - safe, portable, text-based
import json

json.dumps({"key": "value"})

# MessagePack - fast, compact binary
import msgpack  # pip install msgpack

msgpack.packb({"key": "value"})

# Protocol Buffers - schema-based, cross-language
# Requires .proto definition and compilation

# Cloudpickle - extended pickle for lambdas
import cloudpickle  # pip install cloudpickle

cloudpickle.dumps(lambda x: x * 2)

# Dill - extended pickle for more types
import dill  # pip install dill

dill.dumps(lambda x: x * 2)

# When to Use Pickle
Good for:
- Caching computed results locally
- Saving/loading ML models (with trusted sources)
- Inter-process communication (same machine)
- Development and debugging
Avoid for:
- Network protocols (use JSON, protobuf)
- Long-term storage (schema changes break it)
- Untrusted data (security risk)
- Cross-language systems (Python-only)
Best Practices
obj = {"key": "value"}  # Example payload; any picklable object works

# Always use highest protocol for speed/size
pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)

# Use binary mode for files
with open("data.pkl", "wb") as f:
    pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

# Never unpickle untrusted data
# If you must, use RestrictedUnpickler

# Document pickle format versions
# Objects pickled with one Python version
# may not unpickle with another

# Consider alternatives for production
# JSON for APIs, protobuf for cross-language

# Pickle is convenient for internal use but not for untrusted data or
# long-term storage. Use JSON or protocol buffers when security or
# portability matter.
React to this post: