Memory-mapped files let you work with large files as if they were in memory. The OS handles paging data in and out—you get performance without loading everything into RAM.

Basic Memory Mapping

import mmap

# Read-only mapping
with open('large_file.bin', 'rb') as f:
    # Length 0 means "map the whole file"; note mmap raises ValueError
    # if the file is empty.
    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        # Access like bytes
        print(mm[:100])  # First 100 bytes
        print(mm[-100:])  # Last 100 bytes

        # Random access is fast
        print(mm[1000000:1000010])

# Read-write mapping
with open('file.bin', 'r+b') as f:
    # Default access is ACCESS_WRITE: slice assignment patches the file itself.
    with mmap.mmap(f.fileno(), 0) as mm:
        mm[0:5] = b'hello'

Searching Large Files

import mmap
 
def search_in_file(path: str, pattern: bytes) -> list[int]:
    """Find all (possibly overlapping) occurrences of pattern in a file.

    Args:
        path: File to scan.
        pattern: Non-empty byte string to look for.

    Returns:
        Sorted list of byte offsets; empty if there is no match or the
        file is empty.

    Raises:
        ValueError: If pattern is empty (it would "match" at every offset).
    """
    if not pattern:
        raise ValueError("pattern must be non-empty")

    positions: list[int] = []
    with open(path, 'rb') as f:
        # mmap raises ValueError on an empty file; seek(0, 2) returns the
        # file size (2 == SEEK_END), so short-circuit that case.
        if f.seek(0, 2) == 0:
            return positions
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            pos = mm.find(pattern)
            while pos != -1:
                positions.append(pos)
                # Advance by one byte so overlapping matches are found too.
                pos = mm.find(pattern, pos + 1)

    return positions
 
# Search multi-GB file efficiently
# (positions are byte offsets from the start of the file)
matches = search_in_file('huge_log.txt', b'ERROR')
print(f"Found {len(matches)} errors")

Line-by-Line Processing

import mmap
 
def count_lines(path: str) -> int:
    """Count lines without loading entire file."""
    with open(path, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            return mm.count(b'\n')
 
def find_line(path: str, line_number: int) -> bytes:
    """Return line `line_number` (0-based) of a file, without the newline.

    Uses mm.find() to jump between newlines instead of iterating the map
    byte by byte: per-byte iteration runs a Python-level loop over the
    entire file, which defeats the purpose for the "large file" use case.

    Also fixes an edge case: for a file ending in '\n', the original
    returned b'' for the nonexistent line after the final newline; this
    version raises IndexError, consistent with out-of-range requests.

    Raises:
        IndexError: If line_number is negative or past the last line.
    """
    if line_number < 0:
        raise IndexError(f"Line {line_number} not found")

    with open(path, 'rb') as f:
        # Empty file has no lines; mmap would raise ValueError anyway.
        if f.seek(0, 2) == 0:  # 2 == SEEK_END; returns the file size
            raise IndexError(f"Line {line_number} not found")
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            start = 0
            # Skip the first `line_number` newline-terminated lines.
            for _ in range(line_number):
                nl = mm.find(b'\n', start)
                if nl == -1:
                    raise IndexError(f"Line {line_number} not found")
                start = nl + 1
            # A trailing newline does not begin a new line.
            if start >= len(mm):
                raise IndexError(f"Line {line_number} not found")
            end = mm.find(b'\n', start)
            return mm[start:] if end == -1 else mm[start:end]

Binary Data Processing

import mmap
import struct
 
def read_binary_records(path: str, record_size: int):
    """Yield consecutive fixed-size records from a binary file.

    A trailing partial record (file size not a multiple of record_size)
    is silently dropped, matching the original behaviour. An empty file
    yields nothing instead of crashing (mmap cannot map an empty file).

    Args:
        path: Binary file to read.
        record_size: Size of each record in bytes; must be positive.

    Raises:
        ValueError: If record_size is not positive.
    """
    if record_size <= 0:
        raise ValueError("record_size must be positive")
    with open(path, 'rb') as f:
        # seek(0, 2) returns the file size (2 == SEEK_END).
        if f.seek(0, 2) == 0:
            return
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Stop before any trailing partial record.
            for offset in range(0, len(mm) - record_size + 1, record_size):
                yield mm[offset:offset + record_size]
 
# Example: read 4-byte integers
for record in read_binary_records('data.bin', 4):
    # NOTE(review): 'i' uses native byte order and alignment; for files
    # written on other platforms an explicit '<i' or '>i' is safer.
    value = struct.unpack('i', record)[0]
    print(value)

Modifying Files In-Place

import mmap
 
def replace_in_file(path: str, old: bytes, new: bytes) -> int:
    """Replace every occurrence of `old` with `new`, in place.

    In-place patching through mmap only works when the file size cannot
    change, hence the same-length restriction.

    Returns:
        The number of replacements made (0 for an empty file).

    Raises:
        ValueError: If old and new differ in length, or if old is empty
            (an empty needle matches everywhere and would loop forever).
    """
    if len(old) != len(new):
        raise ValueError("Replacement must be same length")
    if not old:
        raise ValueError("Pattern must be non-empty")

    count = 0
    with open(path, 'r+b') as f:
        # mmap raises ValueError on an empty file; nothing to replace.
        if f.seek(0, 2) == 0:  # 2 == SEEK_END; returns the file size
            return 0
        with mmap.mmap(f.fileno(), 0) as mm:
            pos = mm.find(old)
            while pos != -1:
                mm[pos:pos + len(old)] = new
                count += 1
                # Skip past the patched region; replacements never overlap.
                pos = mm.find(old, pos + len(old))

    return count
 
# Replace in-place
# (old and new must be the same length -- the file size cannot change)
replaced = replace_in_file('config.txt', b'localhost', b'127.0.0.1')

Partial Mapping

Map only part of a file:

import mmap
import os

file_size = os.path.getsize('huge_file.bin')  # total bytes to cover
# mmap offsets must be multiples of mmap.ALLOCATIONGRANULARITY; 100 MB
# is a multiple of the typical 4 KiB / 64 KiB granularity.
chunk_size = 100 * 1024 * 1024  # 100 MB chunks
 
def process_in_chunks(path: str, chunk_size: int = 100 * 1024 * 1024):
    """Map and process a large file in fixed-size windows.

    Bug fix: the original sized its loop with the module-level
    `file_size` (computed for 'huge_file.bin'), so it processed the
    wrong extent for any other `path`; the size is now taken from
    `path` itself. `chunk_size` is also a parameter now (same default
    as the old module-level constant).

    Args:
        path: File to process.
        chunk_size: Window size in bytes. Rounded down to a multiple of
            mmap.ALLOCATIONGRANULARITY because mapping offsets must be
            granularity-aligned.
    """
    total = os.path.getsize(path)
    gran = mmap.ALLOCATIONGRANULARITY
    # Keep every mapping offset aligned, as mmap requires.
    step = max(chunk_size - chunk_size % gran, gran)

    with open(path, 'rb') as f:
        offset = 0
        while offset < total:
            length = min(step, total - offset)

            with mmap.mmap(f.fileno(), length,
                           offset=offset,
                           access=mmap.ACCESS_READ) as mm:
                process_chunk(mm)

            offset += length

Anonymous Memory Mapping

Shared memory without a file:

import mmap

# Create anonymous shared memory
# fileno -1 requests an anonymous mapping (no backing file); it starts zeroed.
shared = mmap.mmap(-1, 1024)  # 1KB

# Write data
shared[:5] = b'hello'

# Read data
print(shared[:5])  # b'hello'

# Clean up
# NOTE(review): anonymous maps are shared only with child processes
# forked *after* creation -- unrelated processes cannot attach to this.
shared.close()

Inter-Process Communication

import mmap
import os
 
# Process 1: Create shared memory file
def writer():
    """Create the shared file, map it, and publish the b'ready' flag."""
    # Pre-size the backing file: mmap cannot map an empty file.
    with open('/tmp/shared_mem', 'wb') as fh:
        fh.write(bytes(1024))

    with open('/tmp/shared_mem', 'r+b') as fh:
        with mmap.mmap(fh.fileno(), 0) as region:
            region[0:5] = b'ready'
            # Wait for reader...
 
# Process 2: Read from shared memory
def reader():
    """Map the shared file and busy-wait for the writer's b'ready' flag."""
    with open('/tmp/shared_mem', 'r+b') as fh:
        with mmap.mmap(fh.fileno(), 0) as region:
            # Spin until the writer publishes the flag; the mapping sees
            # the writer's changes without re-reading the file.
            while True:
                if region[:5] == b'ready':
                    break
            print("Data ready!")

Memory-Mapped Numpy Arrays

import mmap
import numpy as np

# Create memory-mapped array (NumPy has built-in support)
# mode='w+' creates (or overwrites) the backing file; this allocates
# 10000 * 10000 * 8 bytes = ~800 MB on disk.
arr = np.memmap('array.dat', dtype='float64', 
                mode='w+', shape=(10000, 10000))

# Work with it like normal array
arr[0, 0] = 1.0
arr[1000:2000, :] = np.random.random((1000, 10000))

# Changes automatically persist
del arr  # Flushes to disk

Regex on Large Files

import mmap
import re
 
def regex_search(path: str, pattern: str) -> list:
    """Run a regex over an entire file via mmap and return all matches.

    The pattern is compiled as bytes because mmap exposes a bytes-like
    buffer, so the returned matches are bytes too.

    Args:
        path: File to scan.
        pattern: Regex given as str; encoded to bytes before compiling.

    Returns:
        List of byte-string matches; [] for an empty file (plain mmap
        would raise ValueError there).
    """
    compiled = re.compile(pattern.encode())

    with open(path, 'rb') as f:
        # seek(0, 2) returns the file size (2 == SEEK_END).
        if f.seek(0, 2) == 0:
            return []
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            return compiled.findall(mm)
 
# Find all email addresses in large log
# (matches come back as bytes because the file is mapped in binary mode)
emails = regex_search('access.log', r'[\w.-]+@[\w.-]+\.\w+')

mmap Access Modes

import mmap
 
# Read-only
mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
 
# Read-write (changes written to file)
mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_WRITE)
 
# Copy-on-write (changes not written to file)
mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_COPY)

Performance Comparison

import mmap
import time
 
def read_normal(path: str, positions: list) -> list:
    """Fetch a 100-byte slice at each offset using seek()/read()."""
    with open(path, 'rb') as fh:
        chunks = []
        for offset in positions:
            fh.seek(offset)
            chunks.append(fh.read(100))
        return chunks
 
def read_mmap(path: str, positions: list) -> list:
    """Fetch a 100-byte slice at each offset by slicing a read-only map."""
    with open(path, 'rb') as fh:
        with mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ) as view:
            # Slicing copies the bytes out, so the list stays valid
            # after the mapping is closed.
            return [view[offset:offset + 100] for offset in positions]
 
# mmap is significantly faster for random access

Best Practices

  1. Use ACCESS_READ when possible: Prevents accidental writes
  2. Watch memory pressure: the OS pages data in and out on demand, but large mappings can still crowd the page cache and slow down other work
  3. Align access to page boundaries: Better performance
  4. Close mappings properly: Use context managers
  5. Consider alternatives: For sequential access, normal I/O may be simpler

Memory mapping is powerful for random access to large files, searching, and inter-process communication. Use it when you need to work with files that don't fit in RAM.

React to this post: