💾 File Operations
Data persistence is essential for real-world workflows! NumPy's file operations let you save analysis results, share datasets, and create data pipelines by efficiently storing and loading arrays in various formats.
```python
import numpy as np

# Create sample analysis data
analysis_results = {
    'sales_data': np.array([[1200, 1350, 1180], [1420, 1290, 1380]]),
    'customer_scores': np.array([8.5, 7.2, 9.1, 6.8, 8.9]),
    'monthly_growth': np.array([0.12, 0.15, 0.08, 0.22, 0.18])
}

print("📊 Analysis Results Ready for Storage:")
for name, data in analysis_results.items():
    print(f"{name}: shape {data.shape}, mean {np.mean(data):.2f}")

# File operations preview
print("\n💾 File I/O Capabilities:")
print("✓ Binary format (.npy) - Single arrays")
print("✓ Archive format (.npz) - Multiple arrays")
print("✓ Text format (.txt/.csv) - Human readable")
print("✓ Compressed storage - Save space")
```
🎯 Why File Operations Matter
File I/O enables essential data workflows:
- Save Analysis Results 💾: Preserve calculations and processed data
- Share Datasets 🤝: Exchange data between teams and systems
- Create Pipelines 🔄: Build multi-step data processing workflows
- Backup Critical Data 🛡️: Protect against data loss
📁 File Format Overview
NumPy supports multiple file formats for different needs:
Binary Formats (Efficient)
```python
import numpy as np

# Sample data for format comparison
data = np.random.random((1000, 50))  # 50,000 numbers

print("Array info:")
print(f"Shape: {data.shape}")
print(f"Data type: {data.dtype}")
print(f"Memory size: {data.nbytes / 1024:.1f} KB")

# Binary formats preserve exact values and are fast
print("\n💾 Binary Formats:")
print("✓ .npy - Single array, fast loading")
print("✓ .npz - Multiple arrays in one file")
print("✓ Compressed .npz - Space-efficient storage")
print("✓ Perfect precision - No data loss")
```
Text Formats (Human-Readable)
```python
import numpy as np

# Small dataset for text format demo
sales_summary = np.array([[120, 135, 145],
                          [98, 112, 125],
                          [156, 167, 175]])
quarters = ['Q1', 'Q2', 'Q3']
regions = ['North', 'South', 'East']

print("Sales Summary (text-friendly):")
print(sales_summary)

print("\n📄 Text Formats:")
print("✓ .txt - Space/tab separated")
print("✓ .csv - Comma separated (Excel compatible)")
print("✓ Human readable and editable")
print("✓ Compatible with other tools")
print("⚠️ May lose precision with floats")
```
🚀 Quick Start Examples
Save and Load Single Array
```python
import numpy as np

# Create analysis data
monthly_sales = np.array([12500, 13200, 11800, 14500, 13900, 12100])
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

print(f"Monthly sales: {monthly_sales}")
print(f"Total sales: ${np.sum(monthly_sales):,}")
print(f"Average: ${np.mean(monthly_sales):,.0f}")

# Save to binary format (most efficient)
# np.save('monthly_sales.npy', monthly_sales)
print("\n💾 Saved as 'monthly_sales.npy'")

# Load back (exact copy)
# loaded_sales = np.load('monthly_sales.npy')
# print(f"Loaded sales: {loaded_sales}")
# print(f"Data identical: {np.array_equal(monthly_sales, loaded_sales)}")

# Note: file operations are commented out for this demo - files would be created in real use
```
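If you want to run the round trip for real without leaving files behind, a minimal sketch using a temporary directory (the path handling is illustrative):

```python
import os
import tempfile
import numpy as np

monthly_sales = np.array([12500, 13200, 11800, 14500, 13900, 12100])

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, 'monthly_sales.npy')
    np.save(path, monthly_sales)   # write the binary .npy file
    loaded_sales = np.load(path)   # load an exact copy back
    print(f"Data identical: {np.array_equal(monthly_sales, loaded_sales)}")  # True
```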
Save Multiple Related Arrays
```python
import numpy as np

# Related analysis datasets
customer_data = {
    'ids': np.array([1001, 1002, 1003, 1004, 1005]),
    'ages': np.array([25, 34, 28, 45, 31]),
    'purchases': np.array([250.50, 180.25, 420.75, 95.00, 310.50]),
    'ratings': np.array([4.5, 3.8, 4.9, 3.2, 4.1])
}

print("Customer Dataset:")
for name, data in customer_data.items():
    print(f"{name}: {data}")

# Save all arrays in one file
# np.savez('customer_analysis.npz', **customer_data)
print("\n💾 All data saved in 'customer_analysis.npz'")

# Load all arrays back
# loaded_data = np.load('customer_analysis.npz')
# print(f"Available arrays: {list(loaded_data.keys())}")
# print(f"Customer IDs: {loaded_data['ids']}")

# Compressed version for large datasets
# np.savez_compressed('customer_analysis_compressed.npz', **customer_data)
print("💾 Compressed version: 'customer_analysis_compressed.npz'")
```
📊 Format Comparison
Choose the right format for your needs:
| Format | Best For | Pros | Cons |
|---|---|---|---|
| .npy | Single arrays, speed | Fast, exact precision | Binary, one array only |
| .npz | Multiple arrays | Multiple arrays, fast | Binary format |
| .npz (compressed) | Large datasets | Space efficient | Slower save/load |
| .txt/.csv | Sharing, inspection | Human readable, universal | Larger files, possible precision loss |
Performance and Size Comparison
```python
import numpy as np

# Create test data
test_data = np.random.random((1000, 10))  # 10,000 float64 numbers
print(f"Test data: {test_data.shape} array of {test_data.dtype}")
print(f"Memory size: {test_data.nbytes / 1024:.1f} KB")

# Rough file-size estimates (no files are created in this demo).
# Actual compression ratios depend heavily on the data: random values
# compress poorly, while repetitive data can shrink dramatically.
print("\n📁 Estimated File Sizes:")
print(f".npy binary: ~{test_data.nbytes / 1024:.1f} KB")
print(f".npz archive: ~{test_data.nbytes / 1024 * 1.1:.1f} KB")
print(f".npz compressed: ~{test_data.nbytes / 1024 * 0.4:.1f} KB")
print(f".txt format: ~{test_data.size * 20 / 1024:.1f} KB")  # ~20 bytes per value as text

print("\n⚡ Performance Characteristics:")
print("Fastest: .npy (direct binary)")
print("Most compact: compressed .npz")
print("Most compatible: .txt/.csv")
print("Best for multiple arrays: .npz")
```
🔄 Data Workflow Example
Typical analysis workflow with file operations:
```python
import numpy as np

# Step 1: Generate/process data
print("🔄 Data Analysis Workflow:")
print("Step 1: Data Processing")
raw_data = np.random.normal(100, 15, 1000)
processed_data = raw_data[raw_data > 80]  # keep readings above 80, dropping low outliers
statistics = {
    'mean': np.mean(processed_data),
    'std': np.std(processed_data),
    'count': len(processed_data)
}
print(f"Raw data: {len(raw_data)} points")
print(f"Processed: {len(processed_data)} points")
print(f"Statistics: mean={statistics['mean']:.1f}, std={statistics['std']:.1f}")

# Step 2: Save intermediate results
print("\nStep 2: Save Intermediate Results")
# np.save('processed_data.npy', processed_data)
# np.savez('analysis_results.npz',
#          raw=raw_data,
#          processed=processed_data,
#          stats=np.array([statistics['mean'], statistics['std'], statistics['count']]))
print("💾 Saved: processed_data.npy, analysis_results.npz")

# Step 3: Later analysis session
print("\nStep 3: Continue Analysis (New Session)")
# loaded_processed = np.load('processed_data.npy')
# analysis_bundle = np.load('analysis_results.npz')
print("📂 Loaded previous analysis")
print("✓ Can continue where we left off")
print("✓ No need to reprocess raw data")
print("✓ Share results with team")
```
🛠️ Advanced Use Cases
Data Pipeline Integration
```python
import numpy as np

# Simulate data pipeline stages
print("🏭 Data Pipeline Example:")

# Stage 1: Data collection
sensor_readings = np.random.normal(25, 3, (24, 10))  # 24 hours, 10 sensors
timestamps = np.arange(24)
print("Stage 1 - Data Collection:")
print(f"Collected {sensor_readings.shape} readings")

# Stage 2: Quality control
valid_readings = (sensor_readings > 15) & (sensor_readings < 35)
quality_score = np.mean(valid_readings, axis=1)  # fraction of in-range sensors per hour
print("Stage 2 - Quality Control:")
print(f"Average quality score: {np.mean(quality_score):.1%}")

# Stage 3: Analysis
hourly_averages = np.mean(sensor_readings, axis=1)
daily_trend = np.gradient(hourly_averages)
print("Stage 3 - Analysis:")
print(f"Temperature trend: {np.mean(daily_trend):.3f}°C/hour")

# Save pipeline results for the next stage
pipeline_output = {
    'raw_readings': sensor_readings,
    'quality_scores': quality_score,
    'hourly_averages': hourly_averages,
    'trend_analysis': daily_trend,
    'timestamps': timestamps
}
print("\n💾 Pipeline Output Ready:")
print(f"✓ {len(pipeline_output)} datasets prepared")
print("✓ Ready for visualization team")
print("✓ Ready for reporting system")
# np.savez('daily_sensor_analysis.npz', **pipeline_output)
```
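The next stage can then pull exactly the arrays it needs by name. A minimal consumer sketch, assuming daily_sensor_analysis.npz was written as above (the 0.9 quality threshold is illustrative):

```python
import numpy as np

# Downstream stage: load only the arrays this step needs
with np.load('daily_sensor_analysis.npz') as archive:
    hourly = archive['hourly_averages']
    quality = archive['quality_scores']

# Keep only hours whose quality score clears the threshold
reliable_hours = hourly[quality >= 0.9]
print(f"{len(reliable_hours)} of {len(hourly)} hours passed quality control")
```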
Backup and Versioning
```python
import numpy as np

# Simulation of analysis versioning
print("🗂️ Analysis Versioning System:")

# Original analysis
analysis_v1 = {
    'sales_data': np.array([1200, 1350, 1180, 1420]),
    'growth_rate': np.array([0.12, 0.15, -0.05, 0.20]),
    'version': '1.0',
    'date': '2024-01-15'
}
print("Version 1.0: Initial analysis")
print(f"Sales data: {analysis_v1['sales_data']}")

# Updated analysis with new data
analysis_v2 = {
    'sales_data': np.array([1200, 1350, 1180, 1420, 1290, 1380]),  # Added data
    'growth_rate': np.array([0.12, 0.15, -0.05, 0.20, -0.09, 0.07]),
    'forecast': np.array([1450, 1500, 1520]),  # New feature
    'version': '2.0',
    'date': '2024-01-22'
}
print("\nVersion 2.0: Updated with new data")
print(f"Added {len(analysis_v2['sales_data']) - len(analysis_v1['sales_data'])} new data points")
print("New feature: 3-month forecast")

# Save with version information in the file name
# np.savez('sales_analysis_v1.0_2024-01-15.npz', **analysis_v1)
# np.savez('sales_analysis_v2.0_2024-01-22.npz', **analysis_v2)
print("\n💾 Versioned Backups:")
print("✓ sales_analysis_v1.0_2024-01-15.npz")
print("✓ sales_analysis_v2.0_2024-01-22.npz")
print("✓ Can compare versions")
print("✓ Can rollback if needed")
```
📚 What You'll Learn
Master data persistence with NumPy file operations:
- 💾 Saving and Loading Arrays - Complete guide to all NumPy file formats and best practices
🚀 Ready for File Operations?
Master data persistence and sharing with NumPy's comprehensive file I/O capabilities!
Learn More: Saving and Loading Arrays