The hashlib module provides secure hash functions for checksums, data integrity, and password storage.

Basic Hashing

import hashlib
 
# Hash a string.  Hash functions operate on bytes, so str input must be
# encoded (here: UTF-8 via .encode()) before hashing.
data = "Hello, World!"
hash_obj = hashlib.sha256(data.encode())
 
print(hash_obj.hexdigest())  # 64 hex chars = 32 bytes for SHA-256
# 'dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f'
 
print(hash_obj.digest())  # Raw bytes
print(hash_obj.digest_size)  # 32 bytes

Incremental Hashing

For large data or streaming:

import hashlib
 
# update() may be called any number of times; the digest reflects the
# concatenation of everything fed in so far.
hasher = hashlib.sha256()
hasher.update(b"chunk 1")
hasher.update(b"chunk 2")
hasher.update(b"chunk 3")
 
# Same as hashing "chunk 1chunk 2chunk 3"
print(hasher.hexdigest())

File Checksums

import hashlib
 
def file_hash(path, algorithm='sha256', chunk_size=8192):
    """Calculate hash of a file efficiently.

    The file is consumed in fixed-size chunks, so memory use stays
    constant no matter how large the file is.
    """
    digest = hashlib.new(algorithm)

    with open(path, 'rb') as stream:
        # iter() with a b'' sentinel stops at end-of-file.
        for block in iter(lambda: stream.read(chunk_size), b''):
            digest.update(block)

    return digest.hexdigest()
 
# Verify file integrity
checksum = file_hash('download.zip')  # NOTE: path must exist when this runs
print(f"SHA-256: {checksum}")
 
# Compare with expected
expected = "abc123..."  # placeholder for the publisher's advertised checksum
if checksum == expected:
    print("File integrity verified")

Multiple Hash Algorithms

import hashlib
 
def multi_hash(data: bytes) -> dict:
    """Generate multiple hashes of *data*, keyed by algorithm name."""
    algorithms = ('md5', 'sha1', 'sha256', 'sha512')
    return {name: hashlib.new(name, data).hexdigest() for name in algorithms}

hashes = multi_hash(b"test data")
# {'md5': '...', 'sha1': '...', 'sha256': '...', 'sha512': '...'}

Available Algorithms

import hashlib
 
# Always available — guaranteed on every Python platform
print(hashlib.algorithms_guaranteed)
# {'sha256', 'sha384', 'sha512', 'sha1', 'md5', ...}
 
# Available on this system (superset: includes OpenSSL-provided extras)
print(hashlib.algorithms_available)
# May include 'sha3_256', 'blake2b', etc.
 
# Use by name — handy when the algorithm is chosen at runtime
hasher = hashlib.new('sha3_256')
hasher.update(b"data")

BLAKE2 for Speed

BLAKE2 is faster than SHA-256 while being equally secure:

import hashlib
 
# BLAKE2b (optimized for 64-bit); digest_size selects the output length
b2b = hashlib.blake2b(b"data", digest_size=32)
print(b2b.hexdigest())
 
# BLAKE2s (optimized for 32-bit)
b2s = hashlib.blake2s(b"data", digest_size=32)
print(b2s.hexdigest())
 
# Keyed hashing (MAC) — key makes the digest unforgeable without the key
key = b"secret key"
mac = hashlib.blake2b(b"message", key=key, digest_size=32)

SHA-3 Family

import hashlib
 
# SHA-3 variants
sha3_256 = hashlib.sha3_256(b"data").hexdigest()
sha3_512 = hashlib.sha3_512(b"data").hexdigest()
 
# SHAKE (variable output length) — caller picks the digest length
shake = hashlib.shake_256(b"data")
print(shake.hexdigest(32))  # 32 bytes output (64 hex chars)
print(shake.hexdigest(64))  # 64 bytes output (128 hex chars)

Password Hashing (Don't Use Raw hashlib!)

import hashlib
import os
 
# DON'T do this for passwords
# bad_hash = hashlib.sha256(password.encode()).hexdigest()
 
# DO use PBKDF2
def hash_password(password: str) -> tuple[bytes, bytes]:
    """Derive a (salt, key) pair for *password* with PBKDF2-HMAC-SHA256.

    A fresh 32-byte random salt is drawn on every call, so hashing the
    same password twice produces different keys.
    """
    random_salt = os.urandom(32)
    derived_key = hashlib.pbkdf2_hmac(
        'sha256', password.encode(), random_salt, iterations=100_000, dklen=32
    )
    return random_salt, derived_key
 
def verify_password(password: str, salt: bytes, key: bytes) -> bool:
    """Verify password against stored hash.

    Re-derives the key from *password* with the stored *salt*, then
    compares it to the stored *key* in constant time.  A plain ``==``
    on bytes can short-circuit at the first mismatching byte, leaking
    how much of the digest matched (a timing side channel).
    """
    import hmac  # local import keeps this snippet self-contained

    new_key = hashlib.pbkdf2_hmac(
        'sha256',
        password.encode(),
        salt,
        iterations=100_000,  # must match the parameters used when hashing
        dklen=32
    )
    return hmac.compare_digest(new_key, key)
 
# Usage — store BOTH the salt and the derived key; the salt is needed
# again at verification time.
salt, hashed = hash_password("my_password")
print(verify_password("my_password", salt, hashed))  # True

Content-Addressable Storage

import hashlib
from pathlib import Path
 
class ContentStore:
    """Store files by their content hash."""
    
    def __init__(self, store_dir: str):
        self.store = Path(store_dir)
        self.store.mkdir(exist_ok=True)
    
    def put(self, data: bytes) -> str:
        """Store data and return its hash."""
        content_hash = hashlib.sha256(data).hexdigest()
        
        # Use first 2 chars as directory (like Git)
        subdir = self.store / content_hash[:2]
        subdir.mkdir(exist_ok=True)
        
        path = subdir / content_hash
        if not path.exists():
            path.write_bytes(data)
        
        return content_hash
    
    def get(self, content_hash: str) -> bytes | None:
        """Retrieve data by hash."""
        path = self.store / content_hash[:2] / content_hash
        if path.exists():
            return path.read_bytes()
        return None
 
# Creates ./content/ on first use; the returned hash is the only handle
# needed to fetch the data back.
store = ContentStore('./content')
hash_id = store.put(b"my data")
data = store.get(hash_id)

Merkle Tree

import hashlib
 
def merkle_root(items: list[bytes]) -> str:
    """Calculate Merkle tree root hash."""
    if not items:
        return hashlib.sha256(b"").hexdigest()

    # Leaf level: hash every item once.
    level = [hashlib.sha256(item).digest() for item in items]

    # Repeatedly collapse adjacent pairs until one hash remains.
    while len(level) > 1:
        if len(level) % 2:
            # Odd count: duplicate the final node (Bitcoin-style padding).
            level = level + level[-1:]
        pairs = zip(level[0::2], level[1::2])
        level = [hashlib.sha256(left + right).digest() for left, right in pairs]

    return level[0].hex()

# Usage
items = [b"tx1", b"tx2", b"tx3", b"tx4"]
root = merkle_root(items)

Cache Keys

import hashlib
import json
 
def cache_key(*args, **kwargs) -> str:
    """Generate deterministic cache key from arguments."""
    payload = {'args': args, 'kwargs': kwargs}
    # sort_keys makes the result independent of kwarg order;
    # default=str covers values json cannot encode natively.
    serialized = json.dumps(payload, sort_keys=True, default=str)
    digest = hashlib.md5(serialized.encode()).hexdigest()
    # 16 hex chars is plenty of entropy for a cache namespace.
    return digest[:16]

# Usage
key = cache_key('users', page=1, limit=10)
# 'a1b2c3d4e5f67890'

Data Deduplication

import hashlib
from pathlib import Path
 
def find_duplicates(directory: str) -> dict[str, list[Path]]:
    """Find duplicate files by content hash.

    Files are hashed in 64 KiB chunks, so arbitrarily large files never
    need to fit in memory (the original read each file whole with
    ``f.read()``).
    """
    hash_to_files: dict[str, list[Path]] = {}

    for path in Path(directory).rglob('*'):
        if path.is_file():
            hasher = hashlib.sha256()
            with open(path, 'rb') as f:
                while chunk := f.read(65536):
                    hasher.update(chunk)

            hash_to_files.setdefault(hasher.hexdigest(), []).append(path)

    # Return only duplicates
    return {h: files for h, files in hash_to_files.items() if len(files) > 1}
 
# NOTE: ./documents must exist for this demo to run.
duplicates = find_duplicates('./documents')
for hash_val, files in duplicates.items():
    print(f"Duplicates ({hash_val[:8]}...):")  # first 8 hex chars as a short ID
    for f in files:
        print(f"  {f}")

Hashing Algorithm Comparison

| Algorithm | Output   | Speed    | Security | Use Case             |
|-----------|----------|----------|----------|----------------------|
| MD5       | 128-bit  | Fast     | Broken   | Checksums only       |
| SHA-1     | 160-bit  | Fast     | Weak     | Legacy systems       |
| SHA-256   | 256-bit  | Good     | Strong   | General purpose      |
| SHA-512   | 512-bit  | Good     | Strong   | High security        |
| BLAKE2b   | Variable | Fast     | Strong   | Performance-critical |
| SHA-3     | Variable | Moderate | Strong   | Post-quantum prep    |

Rule of thumb: Use SHA-256 for general hashing, BLAKE2 for speed, PBKDF2/bcrypt/Argon2 for passwords.

React to this post: