📁 Processing Large Files
When working with large files that don't fit in memory, you need efficient techniques to process data without overwhelming your system. Python provides several approaches for handling big datasets chunk by chunk.
# Example of memory-efficient file processing
def process_large_text_file(filename):
    """Process file line by line to save memory"""
    try:
        line_count = 0
        word_count = 0
        with open(filename, 'r', encoding='utf-8') as file:
            for line in file:
                line_count += 1
                word_count += len(line.split())
                # Process each line individually
                if line_count % 10000 == 0:
                    print(f"Processed {line_count:,} lines...")
        return {
            'lines': line_count,
            'words': word_count
        }
    except FileNotFoundError:
        print(f"File {filename} not found")
        return None

# Simulate processing a large file
# For demo, we'll create sample data
def create_sample_file(filename, num_lines=50000):
    """Create a sample file for testing"""
    with open(filename, 'w', encoding='utf-8') as file:
        for i in range(num_lines):
            file.write(f"Line {i+1}: This is sample text with multiple words.\n")
    print(f"Created sample file with {num_lines:,} lines")

# Create and process sample file
sample_file = 'large_sample.txt'
create_sample_file(sample_file, 50000)
result = process_large_text_file(sample_file)
if result:
    print(f"File processed: {result['lines']:,} lines, {result['words']:,} words")
🎯 Understanding Large File Processing
Large files require special handling to avoid memory issues and ensure efficient processing.
Reading Files in Chunks
def read_file_in_chunks(filename, chunk_size=8192):
    """Read file in fixed-size chunks"""
    try:
        total_size = 0
        chunk_count = 0
        with open(filename, 'rb') as file:
            while True:
                chunk = file.read(chunk_size)
                if not chunk:
                    break
                chunk_count += 1
                total_size += len(chunk)
                # Process chunk here
                # For demo, we'll just count
                if chunk_count % 100 == 0:
                    print(f"Read {chunk_count} chunks, {total_size:,} bytes")
        return {
            'chunks': chunk_count,
            'total_bytes': total_size
        }
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

def process_text_chunks(filename, chunk_size=1024*1024):  # 1MB chunks
    """Process text file in chunks with line boundary handling"""
    buffer = ""
    line_count = 0
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            while True:
                chunk = file.read(chunk_size)
                if not chunk:
                    # Process any remaining data in buffer
                    if buffer:
                        lines = buffer.split('\n')
                        line_count += len([line for line in lines if line.strip()])
                    break
                # Add chunk to buffer
                buffer += chunk
                # Split on newlines, keeping last incomplete line
                lines = buffer.split('\n')
                buffer = lines[-1]  # Keep incomplete line
                complete_lines = lines[:-1]
                # Process complete lines
                for line in complete_lines:
                    if line.strip():  # Skip empty lines
                        line_count += 1
                        # Process line here
                        if line_count % 10000 == 0:
                            print(f"Processed {line_count:,} lines...")
        return line_count
    except Exception as e:
        print(f"Error processing file: {e}")
        return 0

# Test chunk reading
result = read_file_in_chunks(sample_file)
if result:
    print(f"File read: {result['chunks']} chunks, {result['total_bytes']:,} bytes")

# Test text chunk processing
line_count = process_text_chunks(sample_file)
print(f"Text processing: {line_count:,} lines processed")
CSV File Processing
import csv
from collections import defaultdict

def process_large_csv(filename, process_batch_size=1000):
    """Process large CSV file in batches"""
    try:
        batch = []
        batch_count = 0
        total_rows = 0
        column_stats = defaultdict(int)
        with open(filename, 'r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                batch.append(row)
                total_rows += 1
                # Count non-empty values per column
                for column, value in row.items():
                    if value and value.strip():
                        column_stats[column] += 1
                # Process batch when it reaches target size
                if len(batch) >= process_batch_size:
                    batch_count += 1
                    process_csv_batch(batch, batch_count)
                    batch = []  # Clear batch
                if total_rows % 10000 == 0:
                    print(f"Processed {total_rows:,} rows...")
        # Process remaining batch
        if batch:
            batch_count += 1
            process_csv_batch(batch, batch_count)
        return {
            'total_rows': total_rows,
            'batches': batch_count,
            'column_stats': dict(column_stats)
        }
    except Exception as e:
        print(f"Error processing CSV: {e}")
        return None

def process_csv_batch(batch, batch_number):
    """Process a batch of CSV rows"""
    # Example processing: calculate averages, find patterns, etc.
    # For demo, we'll just count rows with specific criteria
    valid_rows = [row for row in batch if all(row.values())]
    print(f"Batch {batch_number}: {len(batch)} rows, {len(valid_rows)} complete")

def create_sample_csv(filename, num_rows=25000):
    """Create sample CSV for testing"""
    import random
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write header
        writer.writerow(['id', 'name', 'age', 'city', 'score'])
        cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
        names = ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank', 'Grace']
        for i in range(num_rows):
            writer.writerow([
                i + 1,
                random.choice(names) + str(random.randint(1, 100)),
                random.randint(18, 80),
                random.choice(cities),
                round(random.uniform(0, 100), 2)
            ])
    print(f"Created sample CSV with {num_rows:,} rows")

# Create and process sample CSV
csv_file = 'large_sample.csv'
create_sample_csv(csv_file, 25000)
csv_result = process_large_csv(csv_file)
if csv_result:
    print(f"CSV processed: {csv_result['total_rows']:,} rows in {csv_result['batches']} batches")
    print("Column completeness:")
    for column, count in csv_result['column_stats'].items():
        percentage = (count / csv_result['total_rows']) * 100
        print(f"  {column}: {percentage:.1f}% complete")
📋 Large File Processing Techniques
| Technique | Best For | Memory Usage |
| --- | --- | --- |
| Line-by-line | Text files, logs | Very low |
| Fixed chunks | Binary files, any format | Low |
| Batch processing | CSV, structured data | Medium |
| Streaming | Real-time data | Very low |
| Memory mapping | Random access needed | Variable |
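The memory-mapping row refers to Python's built-in mmap module, which exposes a file as a bytes-like object backed by the operating system's page cache, so you can search or slice it without reading everything into memory. A minimal read-only sketch (the function name count_newlines_mmap is just for illustration):

import mmap

def count_newlines_mmap(filename):
    """Count newlines through a read-only memory map instead of a full read."""
    with open(filename, 'rb') as file:
        with mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            count = 0
            pos = mm.find(b'\n')
            while pos != -1:
                count += 1
                pos = mm.find(b'\n', pos + 1)
            return count

print(f"Newlines in {sample_file}: {count_newlines_mmap(sample_file):,}")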
🎯 Key Takeaways
🚀 You've Completed Data Processing!
Congratulations! You've learned essential data processing techniques including regex, XML/HTML parsing, API requests, databases, and large file handling. These skills enable you to work with diverse data sources and build robust data processing applications.
What you've mastered:
- Pattern matching and text extraction with regular expressions
- Parsing structured data from XML and HTML documents
- Making HTTP requests and processing API responses
- Working with SQL databases for data storage and retrieval
- Processing large files efficiently without memory issues
Next section: Best Practices - Learn to write clean, maintainable, and efficient Python code.