📁 Processing Large Files

When a file is too large to fit in memory, you need techniques that process it incrementally instead of loading it all at once. Python offers several ways to do this: iterating over a file line by line, reading fixed-size chunks, and handling structured data in batches.

# Example of memory-efficient file processing
def process_large_text_file(filename):
    """Process file line by line to save memory"""
    try:
        line_count = 0
        word_count = 0
        
        with open(filename, 'r', encoding='utf-8') as file:
            for line in file:
                line_count += 1
                word_count += len(line.split())
                
                # Process each line individually
                if line_count % 10000 == 0:
                    print(f"Processed {line_count:,} lines...")
        
        return {
            'lines': line_count,
            'words': word_count
        }
    
    except FileNotFoundError:
        print(f"File {filename} not found")
        return None

# Simulate processing a large file
# For demo, we'll create sample data
def create_sample_file(filename, num_lines=50000):
    """Create a sample file for testing"""
    with open(filename, 'w', encoding='utf-8') as file:
        for i in range(num_lines):
            file.write(f"Line {i+1}: This is sample text with multiple words.\n")
    print(f"Created sample file with {num_lines:,} lines")

# Create and process sample file
sample_file = 'large_sample.txt'
create_sample_file(sample_file, 50000)
result = process_large_text_file(sample_file)

if result:
    print(f"File processed: {result['lines']:,} lines, {result['words']:,} words")

🎯 Understanding Large File Processing

Large files require handling that keeps memory usage bounded: read them line by line, in fixed-size chunks, or in batches of records, rather than loading everything at once.
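
The difference matters because a single readlines() call materializes every line of the file in one list, while iterating over the file object keeps only the current line in memory. A small illustrative sketch, reusing the large_sample.txt file created above (sys.getsizeof measures only the list object itself, not the line strings it references):

import sys

# Loads the entire file into a list; memory grows with file size
with open('large_sample.txt', 'r', encoding='utf-8') as file:
    all_lines = file.readlines()
print(f"readlines(): {len(all_lines):,} lines, list object alone: {sys.getsizeof(all_lines):,} bytes")

# Iterating keeps only one line in memory at a time
longest = 0
with open('large_sample.txt', 'r', encoding='utf-8') as file:
    for line in file:
        longest = max(longest, len(line))
print(f"Iteration: longest line is {longest} characters")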

Reading Files in Chunks

def read_file_in_chunks(filename, chunk_size=8192):
    """Read file in fixed-size chunks"""
    try:
        total_size = 0
        chunk_count = 0
        
        with open(filename, 'rb') as file:
            while True:
                chunk = file.read(chunk_size)
                if not chunk:
                    break
                
                chunk_count += 1
                total_size += len(chunk)
                
                # Process chunk here
                # For demo, we'll just count
                if chunk_count % 100 == 0:
                    print(f"Read {chunk_count} chunks, {total_size:,} bytes")
        
        return {
            'chunks': chunk_count,
            'total_bytes': total_size
        }
    
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

def process_text_chunks(filename, chunk_size=1024*1024):  # ~1 MB chunks (characters, since the file is opened in text mode)
    """Process text file in chunks with line boundary handling"""
    buffer = ""
    line_count = 0
    
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            while True:
                chunk = file.read(chunk_size)
                if not chunk:
                    # Process any remaining data in buffer
                    if buffer:
                        lines = buffer.split('\n')
                        line_count += len([line for line in lines if line.strip()])
                    break
                
                # Add chunk to buffer
                buffer += chunk
                
                # Split on newlines, keeping last incomplete line
                lines = buffer.split('\n')
                buffer = lines[-1]  # Keep incomplete line
                complete_lines = lines[:-1]
                
                # Process complete lines
                for line in complete_lines:
                    if line.strip():  # Skip empty lines
                        line_count += 1
                        # Process line here
                
                # Report progress after each chunk (line_count rarely lands on an exact multiple of 10,000)
                print(f"Processed {line_count:,} lines so far...")
        
        return line_count
    
    except Exception as e:
        print(f"Error processing file: {e}")
        return 0

# Test chunk reading
result = read_file_in_chunks(sample_file)
if result:
    print(f"File read: {result['chunks']} chunks, {result['total_bytes']:,} bytes")

# Test text chunk processing
line_count = process_text_chunks(sample_file)
print(f"Text processing: {line_count:,} lines processed")

CSV File Processing

import csv
from collections import defaultdict

def process_large_csv(filename, process_batch_size=1000):
    """Process large CSV file in batches"""
    try:
        batch = []
        batch_count = 0
        total_rows = 0
        column_stats = defaultdict(int)
        
        with open(filename, 'r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            
            for row in reader:
                batch.append(row)
                total_rows += 1
                
                # Count non-empty values per column
                for column, value in row.items():
                    if value and value.strip():
                        column_stats[column] += 1
                
                # Process batch when it reaches target size
                if len(batch) >= process_batch_size:
                    batch_count += 1
                    process_csv_batch(batch, batch_count)
                    batch = []  # Clear batch
                    
                    if total_rows % 10000 == 0:
                        print(f"Processed {total_rows:,} rows...")
            
            # Process remaining batch
            if batch:
                batch_count += 1
                process_csv_batch(batch, batch_count)
        
        return {
            'total_rows': total_rows,
            'batches': batch_count,
            'column_stats': dict(column_stats)
        }
    
    except Exception as e:
        print(f"Error processing CSV: {e}")
        return None

def process_csv_batch(batch, batch_number):
    """Process a batch of CSV rows"""
    # Example processing: calculate averages, find patterns, etc.
    
    # For demo, count rows where every column has a non-empty value
    valid_rows = [row for row in batch if all(row.values())]
    
    print(f"Batch {batch_number}: {len(batch)} rows, {len(valid_rows)} complete")

def create_sample_csv(filename, num_rows=25000):
    """Create sample CSV for testing"""
    import random
    
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        
        # Write header
        writer.writerow(['id', 'name', 'age', 'city', 'score'])
        
        cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
        names = ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank', 'Grace']
        
        for i in range(num_rows):
            writer.writerow([
                i + 1,
                random.choice(names) + str(random.randint(1, 100)),
                random.randint(18, 80),
                random.choice(cities),
                round(random.uniform(0, 100), 2)
            ])
    
    print(f"Created sample CSV with {num_rows:,} rows")

# Create and process sample CSV
csv_file = 'large_sample.csv'
create_sample_csv(csv_file, 25000)

csv_result = process_large_csv(csv_file)
if csv_result:
    print(f"CSV processed: {csv_result['total_rows']:,} rows in {csv_result['batches']} batches")
    print("Column completeness:")
    for column, count in csv_result['column_stats'].items():
        percentage = (count / csv_result['total_rows']) * 100
        print(f"  {column}: {percentage:.1f}% complete")

📋 Large File Processing Techniques

Technique          Best For                  Memory Usage
Line-by-line       Text files, logs          Very low
Fixed chunks       Binary files, any format  Low
Batch processing   CSV, structured data      Medium
Streaming          Real-time data            Very low
Memory mapping     Random access needed      Variable
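
Memory mapping, listed in the last row above, lets the operating system page file contents in on demand, which helps when you need random access to a large file rather than a single sequential pass. A small sketch using the standard mmap module (counting newlines is just a stand-in for whatever lookups you need):

import mmap

def count_newlines_mmap(filename):
    """Count newline bytes by scanning a memory-mapped file; pages load on demand."""
    with open(filename, 'rb') as file:
        with mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # The mapping behaves like a bytes object: mm[0:20], mm.find(b'...'), etc.
            preview = mm[:20]
            count = 0
            pos = mm.find(b'\n')
            while pos != -1:
                count += 1
                pos = mm.find(b'\n', pos + 1)
            return count, preview

lines, preview = count_newlines_mmap('large_sample.txt')
print(f"mmap: {lines:,} lines; file starts with {preview!r}")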

🎯 Key Takeaways

  • Iterating over a file object line by line keeps memory use nearly constant regardless of file size.
  • Fixed-size chunks work for any format, including binary files, but text processing needs care at line boundaries.
  • Batching rows is a practical middle ground for structured data such as CSV files.

🚀 You've Completed Data Processing!

Congratulations! You've learned essential data processing techniques including regex, XML/HTML parsing, API requests, databases, and large file handling. These skills enable you to work with diverse data sources and build robust data processing applications.

What you've mastered:

  • Pattern matching and text extraction with regular expressions
  • Parsing structured data from XML and HTML documents
  • Making HTTP requests and processing API responses
  • Working with SQL databases for data storage and retrieval
  • Processing large files efficiently without memory issues

Next section: Best Practices - Learn to write clean, maintainable, and efficient Python code.
