💾 File Operations
Data persistence is essential for real-world workflows! NumPy's file operations let you save analysis results, share datasets, and create data pipelines by efficiently storing and loading arrays in various formats.
```python
import numpy as np

# Create sample analysis data
analysis_results = {
    'sales_data': np.array([[1200, 1350, 1180], [1420, 1290, 1380]]),
    'customer_scores': np.array([8.5, 7.2, 9.1, 6.8, 8.9]),
    'monthly_growth': np.array([0.12, 0.15, 0.08, 0.22, 0.18])
}

print("📊 Analysis Results Ready for Storage:")
for name, data in analysis_results.items():
    print(f"{name}: shape {data.shape}, mean {np.mean(data):.2f}")

# File operations preview
print("\n💾 File I/O Capabilities:")
print("✓ Binary format (.npy) - Single arrays")
print("✓ Archive format (.npz) - Multiple arrays")
print("✓ Text format (.txt/.csv) - Human readable")
print("✓ Compressed storage - Save space")
```
🎯 Why File Operations Matter
File I/O enables essential data workflows:
- Save Analysis Results 💾: Preserve calculations and processed data
- Share Datasets 🤝: Exchange data between teams and systems
- Create Pipelines 🔄: Build multi-step data processing workflows
- Backup Critical Data 🛡️: Protect against data loss
📁 File Format Overview
NumPy supports multiple file formats for different needs:
Binary Formats (Efficient)
```python
import numpy as np

# Sample data for format comparison
data = np.random.random((1000, 50))  # 50,000 numbers

print("Array info:")
print(f"Shape: {data.shape}")
print(f"Data type: {data.dtype}")
print(f"Memory size: {data.nbytes / 1024:.1f} KB")

# Binary formats preserve exact values and are fast
print("\n💾 Binary Formats:")
print("✓ .npy - Single array, fast loading")
print("✓ .npz - Multiple arrays in one file")
print("✓ Compressed .npz - Space-efficient storage")
print("✓ Perfect precision - No data loss")
```
Text Formats (Human-Readable)
```python
import numpy as np

# Small dataset for text format demo
sales_summary = np.array([[120, 135, 145],
                          [98, 112, 125],
                          [156, 167, 175]])
quarters = ['Q1', 'Q2', 'Q3']
regions = ['North', 'South', 'East']

print("Sales Summary (text-friendly):")
print(sales_summary)

print("\n📄 Text Formats:")
print("✓ .txt - Space/tab separated")
print("✓ .csv - Comma separated (Excel compatible)")
print("✓ Human readable and editable")
print("✓ Compatible with other tools")
print("⚠️ May lose precision with floats")
```
🚀 Quick Start Examples
Save and Load Single Array
```python
import numpy as np

# Create analysis data
monthly_sales = np.array([12500, 13200, 11800, 14500, 13900, 12100])
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

print(f"Monthly sales: {monthly_sales}")
print(f"Total sales: ${np.sum(monthly_sales):,}")
print(f"Average: ${np.mean(monthly_sales):,.0f}")

# Save to binary format (most efficient)
# np.save('monthly_sales.npy', monthly_sales)
print("\n💾 Saved as 'monthly_sales.npy'")

# Load back (exact copy)
# loaded_sales = np.load('monthly_sales.npy')
# print(f"Loaded sales: {loaded_sales}")
# print(f"Data identical: {np.array_equal(monthly_sales, loaded_sales)}")

# Note: file operations are commented out for this demo - files would be created in real use
```
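If you want to run the round trip for real without leaving files behind, a minimal sketch using a temporary directory (the path handling is illustrative):

```python
import os
import tempfile
import numpy as np

monthly_sales = np.array([12500, 13200, 11800, 14500, 13900, 12100])

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, 'monthly_sales.npy')
    np.save(path, monthly_sales)   # write the binary .npy file
    loaded_sales = np.load(path)   # load an exact copy back
    print(f"Data identical: {np.array_equal(monthly_sales, loaded_sales)}")  # True
```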
Save Multiple Related Arrays
```python
import numpy as np

# Related analysis datasets
customer_data = {
    'ids': np.array([1001, 1002, 1003, 1004, 1005]),
    'ages': np.array([25, 34, 28, 45, 31]),
    'purchases': np.array([250.50, 180.25, 420.75, 95.00, 310.50]),
    'ratings': np.array([4.5, 3.8, 4.9, 3.2, 4.1])
}

print("Customer Dataset:")
for name, data in customer_data.items():
    print(f"{name}: {data}")

# Save all arrays in one file
# np.savez('customer_analysis.npz', **customer_data)
print("\n💾 All data saved in 'customer_analysis.npz'")

# Load all arrays back
# loaded_data = np.load('customer_analysis.npz')
# print(f"Available arrays: {list(loaded_data.keys())}")
# print(f"Customer IDs: {loaded_data['ids']}")

# Compressed version for large datasets
# np.savez_compressed('customer_analysis_compressed.npz', **customer_data)
print("💾 Compressed version: 'customer_analysis_compressed.npz'")
```
📊 Format Comparison
Choose the right format for your needs:
| Format | Best For | Pros | Cons |
|---|---|---|---|
| .npy | Single arrays, speed | Fast, exact precision | Binary, one array only |
| .npz | Multiple arrays | Multiple arrays, fast | Binary format |
| .npz (compressed) | Large datasets | Space efficient | Slower save/load |
| .txt/.csv | Sharing, inspection | Human readable, universal | Larger files, possible precision loss |
Performance and Size Comparison
```python
import numpy as np

# Create test data
test_data = np.random.random((1000, 10))  # 10,000 float64 numbers
print(f"Test data: {test_data.shape} array of {test_data.dtype}")
print(f"Memory size: {test_data.nbytes / 1024:.1f} KB")

# Rough file-size estimates (no files are created in this demo).
# Actual compression ratios depend heavily on the data: random values
# compress poorly, while repetitive data can shrink dramatically.
print("\n📁 Estimated File Sizes:")
print(f".npy binary: ~{test_data.nbytes / 1024:.1f} KB")
print(f".npz archive: ~{test_data.nbytes / 1024 * 1.1:.1f} KB")
print(f".npz compressed: ~{test_data.nbytes / 1024 * 0.4:.1f} KB")
print(f".txt format: ~{test_data.size * 20 / 1024:.1f} KB")  # ~20 bytes per value as text

print("\n⚡ Performance Characteristics:")
print("Fastest: .npy (direct binary)")
print("Most compact: compressed .npz")
print("Most compatible: .txt/.csv")
print("Best for multiple arrays: .npz")
```
🔄 Data Workflow Example
Typical analysis workflow with file operations:
```python
import numpy as np

# Step 1: Generate/process data
print("🔄 Data Analysis Workflow:")
print("Step 1: Data Processing")
raw_data = np.random.normal(100, 15, 1000)
processed_data = raw_data[raw_data > 80]  # keep readings above 80, dropping low outliers
statistics = {
    'mean': np.mean(processed_data),
    'std': np.std(processed_data),
    'count': len(processed_data)
}
print(f"Raw data: {len(raw_data)} points")
print(f"Processed: {len(processed_data)} points")
print(f"Statistics: mean={statistics['mean']:.1f}, std={statistics['std']:.1f}")

# Step 2: Save intermediate results
print("\nStep 2: Save Intermediate Results")
# np.save('processed_data.npy', processed_data)
# np.savez('analysis_results.npz',
#          raw=raw_data,
#          processed=processed_data,
#          stats=np.array([statistics['mean'], statistics['std'], statistics['count']]))
print("💾 Saved: processed_data.npy, analysis_results.npz")

# Step 3: Later analysis session
print("\nStep 3: Continue Analysis (New Session)")
# loaded_processed = np.load('processed_data.npy')
# analysis_bundle = np.load('analysis_results.npz')
print("📂 Loaded previous analysis")
print("✓ Can continue where we left off")
print("✓ No need to reprocess raw data")
print("✓ Share results with team")
```
🛠️ Advanced Use Cases
Data Pipeline Integration
```python
import numpy as np

# Simulate data pipeline stages
print("🏭 Data Pipeline Example:")

# Stage 1: Data collection
sensor_readings = np.random.normal(25, 3, (24, 10))  # 24 hours, 10 sensors
timestamps = np.arange(24)
print("Stage 1 - Data Collection:")
print(f"Collected {sensor_readings.shape} readings")

# Stage 2: Quality control
valid_readings = (sensor_readings > 15) & (sensor_readings < 35)
quality_score = np.mean(valid_readings, axis=1)  # fraction of in-range sensors per hour
print("Stage 2 - Quality Control:")
print(f"Average quality score: {np.mean(quality_score):.1%}")

# Stage 3: Analysis
hourly_averages = np.mean(sensor_readings, axis=1)
daily_trend = np.gradient(hourly_averages)
print("Stage 3 - Analysis:")
print(f"Temperature trend: {np.mean(daily_trend):.3f}°C/hour")

# Save pipeline results for the next stage
pipeline_output = {
    'raw_readings': sensor_readings,
    'quality_scores': quality_score,
    'hourly_averages': hourly_averages,
    'trend_analysis': daily_trend,
    'timestamps': timestamps
}
print("\n💾 Pipeline Output Ready:")
print(f"✓ {len(pipeline_output)} datasets prepared")
print("✓ Ready for visualization team")
print("✓ Ready for reporting system")
# np.savez('daily_sensor_analysis.npz', **pipeline_output)
```
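The next stage can then pull exactly the arrays it needs by name. A minimal consumer sketch, assuming daily_sensor_analysis.npz was written as above (the 0.9 quality threshold is illustrative):

```python
import numpy as np

# Downstream stage: load only the arrays this step needs
with np.load('daily_sensor_analysis.npz') as archive:
    hourly = archive['hourly_averages']
    quality = archive['quality_scores']

# Keep only hours whose quality score clears the threshold
reliable_hours = hourly[quality >= 0.9]
print(f"{len(reliable_hours)} of {len(hourly)} hours passed quality control")
```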
Backup and Versioning
```python
import numpy as np

# Simulation of analysis versioning
print("🗂️ Analysis Versioning System:")

# Original analysis
analysis_v1 = {
    'sales_data': np.array([1200, 1350, 1180, 1420]),
    'growth_rate': np.array([0.12, 0.15, -0.05, 0.20]),
    'version': '1.0',
    'date': '2024-01-15'
}
print("Version 1.0: Initial analysis")
print(f"Sales data: {analysis_v1['sales_data']}")

# Updated analysis with new data
analysis_v2 = {
    'sales_data': np.array([1200, 1350, 1180, 1420, 1290, 1380]),  # Added data
    'growth_rate': np.array([0.12, 0.15, -0.05, 0.20, -0.09, 0.07]),
    'forecast': np.array([1450, 1500, 1520]),  # New feature
    'version': '2.0',
    'date': '2024-01-22'
}
print("\nVersion 2.0: Updated with new data")
print(f"Added {len(analysis_v2['sales_data']) - len(analysis_v1['sales_data'])} new data points")
print("New feature: 3-month forecast")

# Save with version information in the file name
# np.savez('sales_analysis_v1.0_2024-01-15.npz', **analysis_v1)
# np.savez('sales_analysis_v2.0_2024-01-22.npz', **analysis_v2)
print("\n💾 Versioned Backups:")
print("✓ sales_analysis_v1.0_2024-01-15.npz")
print("✓ sales_analysis_v2.0_2024-01-22.npz")
print("✓ Can compare versions")
print("✓ Can rollback if needed")
```
📚 What You'll Learn
Master data persistence with NumPy file operations:
- 💾 Saving and Loading Arrays - Complete guide to all NumPy file formats and best practices
🚀 Ready for File Operations?
Master data persistence and sharing with NumPy's comprehensive file I/O capabilities!
Learn More: Saving and Loading Arrays