Generators let you iterate without loading everything into memory. Here's how they work.
The Problem
# Loads all data into memory at once
def get_all_numbers(n):
    """Return the integers 0..n-1 as a fully materialized list (all in memory at once)."""
    return list(range(n))
numbers = get_all_numbers(10_000_000)  # Uses ~400MB

The Solution: Generators
# Generates values on demand
def get_numbers(n):
    """Lazily yield the integers 0..n-1, one value at a time."""
    yield from range(n)
numbers = get_numbers(10_000_000)  # Uses almost no memory

The yield keyword makes this a generator. It pauses and returns a value, resuming when you ask for the next one.
Using Generators
gen = get_numbers(5)
print(next(gen)) # 0
print(next(gen)) # 1
print(next(gen)) # 2
# Or iterate
for num in get_numbers(5):
print(num)

Generator Expressions
Like list comprehensions, but lazy:
# List comprehension - creates list in memory
squares = [x**2 for x in range(1000000)]
# Generator expression - creates values on demand
squares = (x**2 for x in range(1000000))

Note the parentheses instead of brackets.
When to Use Generators
Large Data
def read_large_file(path):
    """Stream the lines of the file at *path*, each stripped of surrounding whitespace."""
    with open(path) as handle:
        yield from (raw.strip() for raw in handle)
# Process line by line, never load whole file
for line in read_large_file("huge.csv"):
process(line)

Infinite Sequences
def count_forever():
    """Yield 0, 1, 2, ... endlessly — an infinite counter."""
    value = 0
    while True:
        yield value
        value += 1
# Take only what you need
from itertools import islice
first_100 = list(islice(count_forever(), 100))

Pipeline Processing
def read_lines(path):
    """Yield the lines of the file at *path*, newline characters included."""
    with open(path) as handle:
        yield from handle
def parse_json(lines):
    """Decode each JSON document in *lines*, yielding the parsed objects."""
    yield from map(json.loads, lines)
def filter_active(records):
    """Yield only the records whose "active" field is truthy."""
    yield from (rec for rec in records if rec.get("active"))
# Chain them together
pipeline = filter_active(parse_json(read_lines("data.jsonl")))
for record in pipeline:
process(record)

The Iterator Protocol
Generators implement the iterator protocol:
class Counter:
    """Iterator producing 0 .. max_count-1, written out by hand via the iterator protocol."""

    def __init__(self, max_count):
        # Exclusive upper bound, and the next value to hand out.
        self.max = max_count
        self.current = 0

    def __iter__(self):
        # An iterator is its own iterable.
        return self

    def __next__(self):
        if self.current >= self.max:
            raise StopIteration
        value = self.current
        self.current = value + 1
        return value
for num in Counter(5):
print(num)  # 0, 1, 2, 3, 4

Generators are simpler:
# Generator equivalent of the Counter class above: yields 0 .. max_count-1.
def counter(max_count):
# Next value to yield.
current = 0
# Stop once max_count values have been produced.
while current < max_count:
yield current
current += 1

yield from
Delegate to another generator:
def flatten(nested):
    """Recursively yield the leaves of an arbitrarily nested list structure."""
    for element in nested:
        if not isinstance(element, list):
            yield element
        else:
            # Delegate to the recursive call for sub-lists.
            yield from flatten(element)
list(flatten([1, [2, 3], [4, [5, 6]]]))
# [1, 2, 3, 4, 5, 6]

Sending Values
Generators can receive values:
def accumulator():
    """Coroutine-style generator: each send(x) adds x and yields the running total."""
    total = 0
    while True:
        # `yield` hands out the current total and receives the next sent value.
        received = yield total
        if received is not None:
            total += received
acc = accumulator()
next(acc) # Start the generator, returns 0
acc.send(10) # Returns 10
acc.send(5) # Returns 15
acc.send(25)  # Returns 40

Common Patterns
Batch Processing
def batches(iterable, size):
    """Yield successive lists of up to *size* items; the final batch may be shorter."""
    chunk = []
    for element in iterable:
        chunk.append(element)
        if len(chunk) >= size:
            yield chunk
            chunk = []
    # Flush any leftover partial batch.
    if chunk:
        yield chunk
for batch in batches(range(10), 3):
print(batch)
# [0, 1, 2]
# [3, 4, 5]
# [6, 7, 8]
# [9]

Sliding Window
from collections import deque
def sliding_window(iterable, size):
    """Yield overlapping tuples of *size* consecutive items from *iterable*.

    Yields nothing when the input holds fewer than *size* items. (The
    original version called next(it) unguarded while priming the window;
    on a too-short input the resulting StopIteration escapes the generator
    body and is converted to RuntimeError under PEP 479.)
    """
    it = iter(iterable)
    window = deque(maxlen=size)
    # Prime the first window; bail out cleanly if the input is too short.
    for _ in range(size):
        try:
            window.append(next(it))
        except StopIteration:
            return
    yield tuple(window)
    # Slide: maxlen makes each append evict the oldest item automatically.
    for item in it:
        window.append(item)
        yield tuple(window)
list(sliding_window([1, 2, 3, 4, 5], 3))
# [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

itertools
The standard library has powerful generator tools:
from itertools import (
count, # Infinite counter
cycle, # Infinite repetition
repeat, # Repeat value
chain, # Concatenate iterables
islice, # Slice an iterator
takewhile, # Take while condition
dropwhile, # Drop while condition
groupby, # Group consecutive items
)
# Examples
list(islice(count(10), 5)) # [10, 11, 12, 13, 14]
list(islice(cycle([1, 2]), 5)) # [1, 2, 1, 2, 1]
list(chain([1, 2], [3, 4]))  # [1, 2, 3, 4]

Memory Comparison
import sys
# List - stores all values
list_nums = [i for i in range(1000)]
print(sys.getsizeof(list_nums)) # ~8856 bytes
# Generator - stores only the function
gen_nums = (i for i in range(1000))
print(sys.getsizeof(gen_nums))  # ~112 bytes

Gotchas
One-Time Use
gen = (x for x in range(5))
list(gen) # [0, 1, 2, 3, 4]
list(gen)  # [] - exhausted!

No Length
gen = (x for x in range(5))
len(gen)  # TypeError

No Indexing
gen = (x for x in range(5))
gen[0]  # TypeError

When Not to Use
- When you need to access items multiple times
- When you need random access
- When you need to know the length upfront
- When the data easily fits in memory
Use generators for streaming, pipelines, and memory efficiency.
React to this post: