💾 Saving and Loading Arrays

Mastering file operations is essential for real-world data workflows! Learn to save analysis results, share datasets, and build robust data pipelines using NumPy's comprehensive file I/O capabilities.

import numpy as np

# Sample analysis data to save: one row per region, one column per month.
quarterly_rows = [
    [1200, 1350, 1180, 1420],
    [1290, 1380, 1250, 1500],
    [1100, 1200, 1150, 1320],
]
sales_data = np.array(quarterly_rows)

regions = ['North', 'South', 'East']
months = ['Jan', 'Feb', 'Mar', 'Apr']

print(f"Sales data shape: {sales_data.shape}")
print(f"Total sales: ${sales_data.sum():,}")
print(f"Average monthly sales: ${sales_data.mean():,.0f}")

# This data is ready to be saved in various formats!

💾 Binary Format (.npy)

Save and load single arrays efficiently, preserving the exact dtype and full numeric precision.

Basic Save and Load

import numpy as np
import tempfile
import os

# Create sample data: six months of revenue figures.
revenue_figures = [45000, 52000, 48000, 61000, 58000, 55000]
monthly_revenue = np.array(revenue_figures)
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

print(f"Monthly revenue: {monthly_revenue}")
print(f"Data type: {monthly_revenue.dtype}")

# Save to binary file
# Note: In real usage, you'd use actual file paths
temp_dir = tempfile.gettempdir()
file_path = os.path.join(temp_dir, 'monthly_revenue.npy')

# np.save writes the values together with shape and dtype metadata.
np.save(file_path, monthly_revenue)
print(f"✅ Saved to: {file_path}")

# np.load restores an identical array from disk.
loaded_revenue = np.load(file_path)
print(f"📂 Loaded: {loaded_revenue}")

# Verify data integrity
print(f"Data identical: {np.array_equal(monthly_revenue, loaded_revenue)}")
print(f"Data type preserved: {loaded_revenue.dtype}")

# Clean up
os.remove(file_path)

Preserving Data Types

import numpy as np
import tempfile
import os

# Arrays with deliberately different dtypes, to show each survives a round trip.
customer_ids = np.array([1001, 1002, 1003, 1004], dtype=np.int32)
ratings = np.array([4.5, 3.8, 4.9, 4.1], dtype=np.float32)
is_premium = np.array([True, False, True, False], dtype=bool)

print(f"Original data types:")
print(f"Customer IDs: {customer_ids.dtype}")
print(f"Ratings: {ratings.dtype}")
print(f"Premium status: {is_premium.dtype}")

temp_dir = tempfile.gettempdir()

# Map each file name to the array it stores, then save them in one pass.
files_to_arrays = {
    'customer_ids.npy': customer_ids,
    'ratings.npy': ratings,
    'is_premium.npy': is_premium,
}
for npy_name, array in files_to_arrays.items():
    np.save(os.path.join(temp_dir, npy_name), array)

# Load back
loaded_ids = np.load(os.path.join(temp_dir, 'customer_ids.npy'))
loaded_ratings = np.load(os.path.join(temp_dir, 'ratings.npy'))
loaded_premium = np.load(os.path.join(temp_dir, 'is_premium.npy'))

print(f"\nLoaded data types:")
print(f"Customer IDs: {loaded_ids.dtype}")
print(f"Ratings: {loaded_ratings.dtype}")
print(f"Premium status: {loaded_premium.dtype}")

# Clean up
for npy_name in files_to_arrays:
    os.remove(os.path.join(temp_dir, npy_name))

📦 Archive Format (.npz)

Save multiple related arrays in a single file.

Multiple Arrays

import numpy as np
import tempfile
import os

# Related analysis data, bundled so it can be written as one archive.
customer_analysis = dict(
    customer_ids=np.array([1001, 1002, 1003, 1004, 1005]),
    ages=np.array([25, 34, 28, 45, 31]),
    purchase_amounts=np.array([250.50, 180.25, 420.75, 95.00, 310.50]),
    satisfaction_scores=np.array([4.5, 3.8, 4.9, 3.2, 4.1]),
    last_purchase_days=np.array([5, 12, 3, 45, 8]),
)

print(f"Customer analysis contains {len(customer_analysis)} arrays:")
for name in customer_analysis:
    data = customer_analysis[name]
    print(f"  {name}: {data.shape} {data.dtype}")

temp_dir = tempfile.gettempdir()
file_path = os.path.join(temp_dir, 'customer_analysis.npz')

# One savez call stores every keyword argument as a named member.
np.savez(file_path, **customer_analysis)
print(f"\n✅ Saved all arrays to: customer_analysis.npz")

# Load the archive
loaded_data = np.load(file_path)
print(f"\n📂 Available arrays: {list(loaded_data.keys())}")

# Access individual arrays
print(f"Customer IDs: {loaded_data['customer_ids']}")
print(f"Average age: {np.mean(loaded_data['ages']):.1f}")
print(f"Total purchases: ${np.sum(loaded_data['purchase_amounts']):.2f}")

# Close and clean up
loaded_data.close()
os.remove(file_path)

Named vs Positional Arguments

import numpy as np
import tempfile
import os

# Sample data: one array of regional sales per quarter.
sales_q1 = np.array([12000, 13500, 11800])
sales_q2 = np.array([14200, 12900, 15100])
sales_q3 = np.array([13800, 14500, 13200])

temp_dir = tempfile.gettempdir()

# Method 1: Named arguments (recommended)
file_path1 = os.path.join(temp_dir, 'sales_named.npz')
quarterly = {'q1': sales_q1, 'q2': sales_q2, 'q3': sales_q3}
np.savez(file_path1, **quarterly)

# Method 2: Positional arguments
file_path2 = os.path.join(temp_dir, 'sales_positional.npz')
np.savez(file_path2, *[sales_q1, sales_q2, sales_q3])

# Load named version
named_data = np.load(file_path1)
print(f"Named arrays: {list(named_data.keys())}")
print(f"Q1 sales: {named_data['q1']}")

# Load positional version
pos_data = np.load(file_path2)
print(f"\nPositional arrays: {list(pos_data.keys())}")
print(f"First array: {pos_data['arr_0']}")  # Auto-named arr_0, arr_1, etc.

# Clean up: close each archive before deleting its backing file.
for archive, path in ((named_data, file_path1), (pos_data, file_path2)):
    archive.close()
    os.remove(path)

Compressed Archives

import numpy as np
import tempfile
import os

# Large dataset for compression demo
large_data = dict(
    sensor_readings=np.random.random((1000, 50)),  # 50,000 numbers
    timestamps=np.arange(1000),
    sensor_ids=np.random.randint(100, 999, 50),
    calibration_factors=np.ones(50) + np.random.random(50) * 0.1,
)

print(f"Large dataset:")
total_elements = sum(a.size for a in large_data.values())
print(f"Total elements: {total_elements:,}")

temp_dir = tempfile.gettempdir()

# Save uncompressed
uncompressed_path = os.path.join(temp_dir, 'sensor_data.npz')
np.savez(uncompressed_path, **large_data)

# Save compressed — same call signature, compressed archive members.
compressed_path = os.path.join(temp_dir, 'sensor_data_compressed.npz')
np.savez_compressed(compressed_path, **large_data)

# Compare file sizes
uncompressed_size = os.path.getsize(uncompressed_path)
compressed_size = os.path.getsize(compressed_path)
savings_pct = (1 - compressed_size / uncompressed_size) * 100

print(f"\n📁 File Size Comparison:")
print(f"Uncompressed: {uncompressed_size / 1024:.1f} KB")
print(f"Compressed: {compressed_size / 1024:.1f} KB")
print(f"Space savings: {savings_pct:.1f}%")

# Load compressed data (same interface)
loaded_compressed = np.load(compressed_path)
print(f"\nCompressed data loaded successfully")
print(f"Available arrays: {list(loaded_compressed.keys())}")

# Clean up
loaded_compressed.close()
os.remove(uncompressed_path)
os.remove(compressed_path)

📄 Text Formats

Save human-readable files for sharing and compatibility.

Basic Text Files

import numpy as np
import tempfile
import os

# Simple 2D data: rows are regions, columns are quarters.
sales_rows = [[120, 135, 145],
              [98, 112, 125],
              [156, 167, 175]]
product_sales = np.array(sales_rows)

regions = ['North', 'South', 'East']
quarters = ['Q1', 'Q2', 'Q3']

print(f"Product sales by region and quarter:")
print(product_sales)

temp_dir = tempfile.gettempdir()
text_path = os.path.join(temp_dir, 'product_sales.txt')

# Save as text (space-separated by default)
np.savetxt(text_path, product_sales, fmt='%d')
print(f"\n✅ Saved as text file")

# Load back
loaded_sales = np.loadtxt(text_path)
print(f"📂 Loaded from text:")
print(loaded_sales)

# Check if data matches (converted to float)
print(f"Data matches: {np.allclose(product_sales, loaded_sales)}")
print(f"Note: Loaded as {loaded_sales.dtype} (text files default to float)")

os.remove(text_path)

CSV Format

import numpy as np
import tempfile
import os

# Customer data suitable for CSV: id, age, rating, purchase amount per row.
customer_data = np.array([[1001, 25, 4.5, 250.50],
                          [1002, 34, 3.8, 180.25],
                          [1003, 28, 4.9, 420.75],
                          [1004, 45, 3.2, 95.00]])

columns = ['CustomerID', 'Age', 'Rating', 'Purchase']
print(f"Customer data for CSV export:")
print(f"Columns: {columns}")
print(customer_data)

temp_dir = tempfile.gettempdir()
csv_path = os.path.join(temp_dir, 'customers.csv')

# Save as CSV; the header row is built from the column list shown above.
np.savetxt(csv_path, customer_data,
           delimiter=',',
           fmt=['%d', '%d', '%.1f', '%.2f'],  # Different formats per column
           header=','.join(columns),
           comments='')  # Remove # from header

print(f"\n✅ Saved as CSV file")

# Read the text file to show format
with open(csv_path, 'r') as csv_file:
    content = csv_file.read()
print(f"📄 CSV content:")
print(content)

# Load back, skipping the header row
loaded_data = np.loadtxt(csv_path, delimiter=',', skiprows=1)
print(f"📂 Loaded data:")
print(loaded_data)

os.remove(csv_path)

Custom Formatting

import numpy as np
import tempfile
import os

# Financial data requiring specific formatting
financial_data = np.array([[1200.456, 0.12345, 45],
                           [1350.789, 0.15678, 52],
                           [1180.123, 0.08901, 38]])

print(f"Financial data (raw):")
print(financial_data)

temp_dir = tempfile.gettempdir()
formatted_path = os.path.join(temp_dir, 'financial_report.txt')

# Save with custom formatting
np.savetxt(formatted_path, financial_data,
           fmt=['$%.2f', '%6.2f%%', '%3d'],  # Currency, percentage, integer
           delimiter='\t',  # Tab separated
           header='Revenue\tGrowth\tEmployees',
           comments='# ')

print(f"\n✅ Saved with custom formatting")

# Show formatted content
with open(formatted_path, 'r') as f:
    content = f.read()
print(f"📄 Formatted content:")
print(content)

def _plain_number(token):
    """Convert a token such as '$1200.46' or '  0.12%' back to a float.

    np.loadtxt hands each field to the converter as str (bytes on older
    NumPy releases), so normalize first, then strip the currency/percent
    decoration that savetxt's custom fmt added.
    """
    if isinstance(token, bytes):
        token = token.decode()
    return float(token.strip().strip('$%'))

# Bug fix: a plain np.loadtxt(formatted_path, skiprows=1) raises
# ValueError, because '$1200.46' and '0.12%' are not parseable floats.
# Converters on the decorated columns undo the formatting on load.
loaded_financial = np.loadtxt(formatted_path, skiprows=1,
                              converters={0: _plain_number, 1: _plain_number})
print(f"📂 Loaded data:")
print(loaded_financial)

os.remove(formatted_path)

🔄 Real-World Workflows

Practical examples of file operations in data workflows.

Analysis Pipeline

import numpy as np
import tempfile
import os

print("🔄 Data Analysis Pipeline Example")

# Step 1: Raw data processing
np.random.seed(42)  # fixed seed keeps the demo reproducible run to run
raw_sensor_data = np.random.normal(25, 3, (24, 10))  # 24 hours, 10 sensors
timestamps = np.arange(24)

print(f"Step 1: Processing raw data")
print(f"Raw data shape: {raw_sensor_data.shape}")

# Step 2: Data cleaning and analysis
valid_readings = np.logical_and(raw_sensor_data > 15, raw_sensor_data < 35)
hourly_averages = raw_sensor_data.mean(axis=1)
daily_trend = np.gradient(hourly_averages)
quality_score = valid_readings.mean()

print(f"Step 2: Analysis complete")
print(f"Data quality: {quality_score:.1%}")
print(f"Temperature trend: {np.mean(daily_trend):.3f}°C/hour")

# Step 3: Save intermediate results
temp_dir = tempfile.gettempdir()
pipeline_file = os.path.join(temp_dir, 'sensor_analysis.npz')

analysis_results = dict(
    raw_data=raw_sensor_data,
    hourly_averages=hourly_averages,
    daily_trend=daily_trend,
    quality_score=np.array([quality_score]),
    timestamps=timestamps,
    metadata=np.array(['2024-01-15', '10_sensors', 'calibrated'], dtype='U20'),
)

np.savez_compressed(pipeline_file, **analysis_results)
print(f"Step 3: Saved analysis results")

# Step 4: Load for reporting (simulate new session)
print(f"\nStep 4: Generate reports (new session)")
loaded_analysis = np.load(pipeline_file)

hourly = loaded_analysis['hourly_averages']
print(f"📊 Analysis Summary:")
print(f"Data quality: {loaded_analysis['quality_score'][0]:.1%}")
print(f"Average temperature: {np.mean(hourly):.1f}°C")
print(f"Temperature range: {np.min(hourly):.1f} - {np.max(hourly):.1f}°C")

# Export summary for sharing
summary_path = os.path.join(temp_dir, 'temperature_summary.csv')
summary_data = np.stack(
    [loaded_analysis['timestamps'], hourly, loaded_analysis['daily_trend']],
    axis=1,
)

np.savetxt(summary_path, summary_data,
           delimiter=',',
           fmt=['%d', '%.2f', '%.4f'],
           header='Hour,Temperature,Trend',
           comments='')

print(f"📤 Exported summary for sharing: temperature_summary.csv")

# Clean up
loaded_analysis.close()
os.remove(pipeline_file)
os.remove(summary_path)

Data Backup and Versioning

import numpy as np
import tempfile
import os
from datetime import datetime

print("🗂️ Data Backup and Versioning Example")

# Simulate evolving analysis
def create_analysis_v1():
    """Return the first iteration of the sales analysis bundle."""
    sales = np.array([1200, 1350, 1180, 1420])
    growth = np.array([0.12, 0.15, -0.05, 0.20])
    return {
        'sales_data': sales,
        'growth_rates': growth,
        'version': np.array(['1.0'], dtype='U10'),
        'created_date': np.array(['2024-01-15'], dtype='U10'),
    }

def create_analysis_v2():
    """Return the second iteration: more data points plus forecasting."""
    return {
        'sales_data': np.array([1200, 1350, 1180, 1420, 1290, 1380]),
        'growth_rates': np.array([0.12, 0.15, -0.05, 0.20, -0.09, 0.07]),
        'forecast': np.array([1450, 1500, 1520]),  # New feature
        'confidence_intervals': np.array([50, 60, 70]),  # New feature
        'version': np.array(['2.0'], dtype='U10'),
        'created_date': np.array(['2024-01-22'], dtype='U10'),
    }

temp_dir = tempfile.gettempdir()

# Save version 1
analysis_v1 = create_analysis_v1()
v1_file = os.path.join(temp_dir, 'sales_analysis_v1.0.npz')
np.savez_compressed(v1_file, **analysis_v1)
print(f"✅ Saved version 1.0")

# Save version 2
analysis_v2 = create_analysis_v2()
v2_file = os.path.join(temp_dir, 'sales_analysis_v2.0.npz')
np.savez_compressed(v2_file, **analysis_v2)
print(f"✅ Saved version 2.0")

# Load and compare versions
v1_data = np.load(v1_file)
v2_data = np.load(v2_file)

print(f"\n📊 Version Comparison:")
print(f"V1 data points: {len(v1_data['sales_data'])}")
print(f"V2 data points: {len(v2_data['sales_data'])}")
print(f"V1 features: {list(v1_data.keys())}")
print(f"V2 features: {list(v2_data.keys())}")

# Show evolution: compare v2's opening period against all of v1.
common_data = v1_data['sales_data']
overlap = v2_data['sales_data'][:len(common_data)]
v1_total = np.sum(common_data)
v2_total = np.sum(overlap)

print(f"\nAnalysis Evolution:")
print(f"V1 total sales: ${v1_total:,}")
print(f"V2 total sales (same period): ${v2_total:,}")
print(f"Data consistency: {np.array_equal(common_data, overlap)}")

if 'forecast' in v2_data:
    print(f"V2 adds forecasting: {v2_data['forecast']}")

# Clean up
v1_data.close()
v2_data.close()
os.remove(v1_file)
os.remove(v2_file)

🛠️ Best Practices

File Organization

import numpy as np
import tempfile
import os

print("📁 File Organization Best Practices")

# Organize by analysis type and date
temp_dir = tempfile.gettempdir()
project_dir = os.path.join(temp_dir, 'customer_analysis_2024')

# Simulate directory structure: pipeline stage -> dated artifact name.
file_structure = {
    'raw_data': 'customer_raw_data_2024_01_15.npz',
    'processed': 'customer_processed_2024_01_15.npz', 
    'results': 'customer_analysis_results_2024_01_15.npz',
    'reports': 'customer_summary_2024_01_15.csv'
}

print(f"Recommended file structure:")
for category, filename in file_structure.items():
    # Bug fix: the message dropped the loop variable and printed a literal
    # placeholder; show the actual file name for each category.
    print(f"  {category}/: {filename}")

# Example data
sample_data = {
    'customer_ids': np.array([1001, 1002, 1003]),
    'scores': np.array([8.5, 7.2, 9.1]),
    'analysis_date': np.array(['2024-01-15'], dtype='U10')
}

# Save with descriptive names
descriptive_filename = os.path.join(temp_dir, 'customer_segmentation_analysis_v2.1_2024-01-15.npz')
np.savez_compressed(descriptive_filename, **sample_data)

print(f"\n✅ Saved with descriptive name:")
print(f"  {os.path.basename(descriptive_filename)}")
print(f"  Contains: project_type_version_date")

os.remove(descriptive_filename)

Error Handling

import numpy as np
import tempfile
import os

print("⚠️ Error Handling Best Practices")

def safe_save_analysis(data_dict, filename):
    """Safely save analysis with backup and validation.

    Validates that every value in data_dict is a non-empty NumPy array,
    renames any existing file to *_backup.npz before overwriting, then
    writes a compressed .npz archive.

    Returns True on success, False on any failure (the error is printed,
    never raised, so a demo pipeline keeps running).
    """
    try:
        # Validate data before saving — fail fast, before touching disk.
        for name, array in data_dict.items():
            if not isinstance(array, np.ndarray):
                raise ValueError(f"{name} is not a NumPy array")
            if array.size == 0:
                raise ValueError(f"{name} is empty")
        
        # Create backup if file exists
        if os.path.exists(filename):
            backup_name = filename.replace('.npz', '_backup.npz')
            if os.path.exists(backup_name):
                os.remove(backup_name)
            os.rename(filename, backup_name)
            print(f"📦 Created backup: {os.path.basename(backup_name)}")
        
        # Save new data
        np.savez_compressed(filename, **data_dict)
        print(f"✅ Successfully saved: {os.path.basename(filename)}")
        return True
        
    except Exception as e:
        print(f"❌ Save failed: {e}")
        return False

def safe_load_analysis(filename):
    """Safely load analysis with error handling.

    Returns the open NpzFile on success (caller is responsible for
    closing it) or None on failure.
    """
    try:
        if not os.path.exists(filename):
            # Bug fix: the message previously printed a literal placeholder
            # instead of interpolating the missing file's path.
            raise FileNotFoundError(f"File not found: {filename}")
        
        data = np.load(filename)
        print(f"✅ Successfully loaded: {os.path.basename(filename)}")
        print(f"   Contains: {list(data.keys())}")
        return data
        
    except Exception as e:
        print(f"❌ Load failed: {e}")
        return None

# Test error handling
temp_dir = tempfile.gettempdir()
test_file = os.path.join(temp_dir, 'test_analysis.npz')

# Valid data
valid_data = {
    'sales': np.array([1200, 1350, 1180]),
    'dates': np.array(['2024-01', '2024-02', '2024-03'], dtype='U10')
}

# Test saving
success = safe_save_analysis(valid_data, test_file)

if success:
    # Test loading
    loaded = safe_load_analysis(test_file)
    if loaded:
        loaded.close()

# Test with invalid data
invalid_data = {
    'empty_array': np.array([]),  # This will cause an error
    'valid_data': np.array([1, 2, 3])
}

print(f"\nTesting with invalid data:")
safe_save_analysis(invalid_data, test_file)

# Clean up
if os.path.exists(test_file):
    os.remove(test_file)

🎯 Key Takeaways

🎉 Congratulations!

You've mastered NumPy file operations! You can now:

  • Save and load arrays in multiple formats
  • Build robust data processing pipelines
  • Share analysis results effectively
  • Implement proper data backup strategies

🚀 What's Next?

You've completed the entire NumPy tutorial series! 🎉

You've mastered:

  • ✅ Array creation and manipulation
  • ✅ Mathematical operations and functions
  • ✅ Indexing and data selection
  • ✅ Aggregation and analysis
  • ✅ Advanced features and optimization
  • ✅ File operations and data persistence

Ready for more? Explore pandas for data analysis, matplotlib for visualization, or dive into machine learning with scikit-learn!

Congratulations on becoming a NumPy expert! 🏆

Was this helpful?

😔Poor
🙁Fair
😊Good
😄Great
🤩Excellent