🚀 Advanced Topics
Ready to level up your NumPy skills? Advanced topics unlock powerful capabilities for real-world data science work: generating random data for testing, handling missing values gracefully, and optimizing performance for large datasets.
import numpy as np
# Advanced NumPy capabilities preview
np.random.seed(42) # Reproducible random numbers
# Generate sample data
data = np.random.normal(100, 15, 1000) # Normal distribution
data[np.random.choice(1000, 50, replace=False)] = np.nan # Add missing values
print(f"Dataset shape: {data.shape}")
print(f"Valid values: {np.sum(~np.isnan(data))}")
print(f"Missing values: {np.sum(np.isnan(data))}")
print(f"Mean (ignoring NaN): {np.nanmean(data):.2f}")
🎯 Why Advanced Topics Matter
Advanced NumPy features become essential when working with real data:
- Random Data Generation 🎲: Create test datasets, simulations, and sampling
- Missing Data Handling 🔧: Deal with incomplete or corrupted data gracefully
- Performance Optimization ⚡: Handle large datasets efficiently
🧩 Advanced Feature Overview
Random Number Generation
Create controlled randomness for testing and simulations.
import numpy as np
# Set seed for reproducibility
np.random.seed(42)
# Different types of random data
random_integers = np.random.randint(1, 101, 5) # Random integers 1-100
random_floats = np.random.random(5) # Random floats 0-1
normal_data = np.random.normal(50, 10, 5) # Normal distribution
print(f"Random integers: {random_integers}")
print(f"Random floats: {random_floats.round(3)}")
print(f"Normal distribution: {normal_data.round(1)}")
# Sample from existing data
choices = np.array(['A', 'B', 'C', 'D', 'E'])
sample = np.random.choice(choices, 3, replace=False)
print(f"Random sample: {sample}")
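Note that the examples in this section use the legacy np.random.seed interface; newer NumPy code (1.17+) often uses the Generator API instead. A minimal sketch of the equivalent calls:
import numpy as np
# Modern Generator API: a local rng object replaces the global np.random state
rng = np.random.default_rng(42)
random_integers = rng.integers(1, 101, 5)  # Random integers 1-100
random_floats = rng.random(5)              # Random floats 0-1
normal_data = rng.normal(50, 10, 5)        # Normal distribution
sample = rng.choice(['A', 'B', 'C', 'D', 'E'], 3, replace=False)
print(f"Generator sample: {sample}")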
NaN (Missing Data) Handling
Work with incomplete datasets effectively.
import numpy as np
# Data with missing values
scores = np.array([85.0, np.nan, 92.0, 78.0, np.nan, 96.0, 89.0])
print(f"Test scores: {scores}")
# Handle missing data
valid_scores = scores[~np.isnan(scores)]
mean_score = np.nanmean(scores)
count_missing = np.sum(np.isnan(scores))
print(f"Valid scores: {valid_scores}")
print(f"Average (ignoring NaN): {mean_score:.1f}")
print(f"Missing values: {count_missing}")
# Fill missing values
filled_scores = np.where(np.isnan(scores), mean_score, scores)
print(f"Filled scores: {filled_scores.round(1)}")
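If you just need to swap NaN for a constant rather than the mean, np.nan_to_num is a one-line alternative (the nan= keyword requires NumPy 1.17+; the fill value of 0.0 below is an arbitrary choice):
import numpy as np
scores = np.array([85.0, np.nan, 92.0, 78.0, np.nan, 96.0, 89.0])
# Replace every NaN with a fixed value in one call
zero_filled = np.nan_to_num(scores, nan=0.0)
print(f"Zero-filled scores: {zero_filled}")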
Performance Optimization
Write efficient code for large datasets.
import numpy as np
import time
# Compare performance: Python loop vs vectorization
size = 100000
a = np.random.random(size)
b = np.random.random(size)
# Loop-based operation (slow)
start_time = time.time()
result_loop = np.empty(size)
for i in range(size):
    result_loop[i] = a[i] * b[i] + np.sin(a[i])
loop_time = time.time() - start_time
# Vectorized operation (efficient)
start_time = time.time()
result_vectorized = a * b + np.sin(a)
vectorized_time = time.time() - start_time
print(f"Dataset size: {size:,} elements")
print(f"Loop time: {loop_time:.4f} seconds")
print(f"Vectorized time: {vectorized_time:.4f} seconds")
print(f"Result shape: {result_vectorized.shape}")
# Memory efficiency tip
print(f"Memory usage: {result_vectorized.nbytes / 1024:.1f} KB")
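Beyond vectorizing, you can sometimes cut down on temporary arrays by writing results into a preallocated buffer with the out= argument that NumPy ufuncs accept. A minimal sketch repeating the computation above; whether this helps in practice depends on array size and workload:
import numpy as np
size = 100000
a = np.random.random(size)
b = np.random.random(size)
# Preallocate a result buffer and fill it in place
result = np.empty(size)
np.multiply(a, b, out=result)  # result = a * b without allocating a new array
result += np.sin(a)            # in-place add; np.sin(a) still allocates one temporary
print(f"In-place result shape: {result.shape}")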
🎲 Random Number Applications
Random numbers have many practical uses in data science.
Data Simulation
import numpy as np
# Simulate customer data
np.random.seed(123)
n_customers = 100
# Generate realistic customer profiles
ages = np.random.normal(35, 12, n_customers).astype(int)
ages = np.clip(ages, 18, 80) # Keep ages realistic
incomes = np.random.lognormal(10.5, 0.5, n_customers) # Income distribution
purchase_amounts = incomes * 0.001 + np.random.normal(0, 50, n_customers)
purchase_amounts = np.maximum(purchase_amounts, 10) # Minimum purchase
print(f"Customer simulation ({n_customers} customers):")
print(f"Age range: {ages.min()} - {ages.max()}")
print(f"Average income: ${incomes.mean():,.0f}")
print(f"Average purchase: ${purchase_amounts.mean():.2f}")
A/B Testing
import numpy as np
# A/B test simulation
np.random.seed(456)
# Control group (current design)
control_conversion_rate = 0.12
control_visitors = 1000
control_conversions = np.random.binomial(control_visitors, control_conversion_rate)
# Test group (new design)
test_conversion_rate = 0.15 # 3 percentage point improvement over control
test_visitors = 1000
test_conversions = np.random.binomial(test_visitors, test_conversion_rate)
print(f"A/B Test Results:")
print(f"Control: {control_conversions}/{control_visitors} = {control_conversions/control_visitors:.1%}")
print(f"Test: {test_conversions}/{test_visitors} = {test_conversions/test_visitors:.1%}")
improvement = (test_conversions/test_visitors) - (control_conversions/control_visitors)
print(f"Improvement: {improvement:.1%}")
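A single simulated run can be noisy, so one common follow-up is to repeat the experiment many times and see how often the test group actually comes out ahead. A minimal sketch, reusing the rates and visitor counts from the example above:
import numpy as np
np.random.seed(456)
n_runs = 10000
# Simulate many A/B experiments at once and convert counts to conversion rates
control_rates = np.random.binomial(1000, 0.12, n_runs) / 1000
test_rates = np.random.binomial(1000, 0.15, n_runs) / 1000
win_rate = np.mean(test_rates > control_rates)
print(f"Test beats control in {win_rate:.1%} of {n_runs:,} simulated runs")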
🔧 Missing Data Challenges
Real-world data often has missing values that need careful handling.
Data Quality Analysis
import numpy as np
# Simulated survey data with missing responses
np.random.seed(789)
survey_responses = np.random.choice([1, 2, 3, 4, 5, np.nan],
                                    size=(50, 4),
                                    p=[0.1, 0.15, 0.25, 0.25, 0.15, 0.1])
questions = ['Satisfaction', 'Recommendation', 'Value', 'Support']
print(f"Survey Data Quality Report:")
print(f"Total responses: {survey_responses.shape[0]}")
for i, question in enumerate(questions):
    col_data = survey_responses[:, i]
    missing_count = np.sum(np.isnan(col_data))
    completion_rate = (len(col_data) - missing_count) / len(col_data)
    if missing_count == 0:
        avg_score = np.mean(col_data)
    else:
        avg_score = np.nanmean(col_data)
    print(f"{question}: {completion_rate:.1%} complete, avg: {avg_score:.1f}")
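The per-question loop above can also be written without an explicit loop, because the NaN-aware functions accept an axis argument. A minimal sketch that regenerates the same simulated responses (same seed) and summarizes them column-wise:
import numpy as np
np.random.seed(789)
survey_responses = np.random.choice([1, 2, 3, 4, 5, np.nan],
                                    size=(50, 4),
                                    p=[0.1, 0.15, 0.25, 0.25, 0.15, 0.1])
# Column-wise statistics in one call each, no Python loop
missing_per_question = np.sum(np.isnan(survey_responses), axis=0)
avg_per_question = np.nanmean(survey_responses, axis=0)
print(f"Missing per question: {missing_per_question}")
print(f"Average per question: {avg_per_question.round(1)}")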
⚡ Performance Considerations
Understanding performance helps you work with large datasets efficiently.
Memory Management
import numpy as np
# Memory-efficient data types
large_integers = np.arange(1000000, dtype=np.int32) # 4 bytes per number
small_integers = np.zeros(1000000, dtype=np.int8) # 1 byte per number (only for values in -128 to 127)
print(f"Memory usage comparison (1M numbers):")
print(f"int32: {large_integers.nbytes / 1024 / 1024:.1f} MB")
print(f"int8: {small_integers.nbytes / 1024 / 1024:.1f} MB")
print(f"Memory savings: {(1 - small_integers.nbytes/large_integers.nbytes)*100:.0f}%")
# Choose appropriate data types
ratings = np.array([1, 2, 3, 4, 5] * 100000, dtype=np.int8) # Ratings 1-5 fit in int8
prices = np.random.uniform(10, 1000, 100000).astype(np.float32) # 32-bit sufficient for prices
print(f"Ratings array: {ratings.nbytes / 1024:.1f} KB")
print(f"Prices array: {prices.nbytes / 1024:.1f} KB")
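Before downcasting, it is worth confirming that your values actually fit in the smaller type; np.iinfo reports the representable range of an integer dtype (np.finfo does the same for floats). A minimal sketch:
import numpy as np
values = np.array([1, 2, 3, 4, 5] * 100000)
info = np.iinfo(np.int8)
# Only downcast when every value fits in the target range
if values.min() >= info.min and values.max() <= info.max:
    values = values.astype(np.int8)
print(f"int8 range: {info.min} to {info.max}")
print(f"Array dtype after check: {values.dtype}")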
📚 What You'll Learn
This section covers essential advanced techniques:
- 🎲 Random Number Generation - Create test data, simulations, and sampling for analysis
- 🔧 Working with NaN Values - Handle missing data gracefully in real-world datasets
- ⚡ Performance Optimization - Write efficient code for large-scale data processing
🧠 Real-World Applications
Data Science Workflow
import numpy as np
# Typical data science workflow with advanced features
np.random.seed(2023)
# 1. Generate or load data (with missing values)
raw_data = np.random.normal(100, 20, 1000)
raw_data[np.random.choice(1000, 100, replace=False)] = np.nan
# 2. Data quality assessment
total_points = len(raw_data)
missing_points = np.sum(np.isnan(raw_data))
completion_rate = (total_points - missing_points) / total_points
print(f"📊 Data Quality Assessment:")
print(f"Total data points: {total_points:,}")
print(f"Missing values: {missing_points} ({missing_points/total_points:.1%})")
print(f"Completion rate: {completion_rate:.1%}")
# 3. Handle missing data
clean_data = raw_data[~np.isnan(raw_data)]
mean_value = np.nanmean(raw_data)  # same value as the mean of clean_data
print(f"\n📈 Data Summary:")
print(f"Mean (ignoring NaN): {mean_value:.2f}")
print(f"Std deviation: {np.std(clean_data):.2f}")
print(f"Data range: {np.min(clean_data):.1f} - {np.max(clean_data):.1f}")
Business Intelligence
import numpy as np
# Simulate business metrics with realistic challenges
np.random.seed(2024)
# Sales data with seasonal patterns and missing values
months = 12
base_sales = 10000
seasonal_factor = 1 + 0.3 * np.sin(np.arange(months) * 2 * np.pi / 12)
monthly_sales = base_sales * seasonal_factor + np.random.normal(0, 1000, months)
# Introduce some missing data (system downtime)
monthly_sales[np.random.choice(months, 2, replace=False)] = np.nan
print(f"📊 Monthly Sales Analysis:")
print(f"Available months: {np.sum(~np.isnan(monthly_sales))}/{months}")
if np.any(np.isnan(monthly_sales)):
    projected_annual = np.nanmean(monthly_sales) * 12
    print(f"Projected annual sales: ${projected_annual:,.0f}")
else:
    actual_annual = np.sum(monthly_sales)
    print(f"Actual annual sales: ${actual_annual:,.0f}")
# Identify best and worst performing months
valid_months = ~np.isnan(monthly_sales)
if np.any(valid_months):
    best_month = np.nanargmax(monthly_sales) + 1
    worst_month = np.nanargmin(monthly_sales) + 1
    print(f"Best month: Month {best_month}")
    print(f"Worst month: Month {worst_month}")
🚀 Ready for Advanced NumPy?
Master NumPy's advanced features for professional data science work! Start with random number generation for creating test data and simulations.
Begin with: Random Number Generation