🔧 Working with NaN Values
Missing data is a reality in real-world datasets! NumPy's NaN (Not a Number) handling provides robust tools for detecting, filtering, and analyzing incomplete data without breaking your calculations.
import numpy as np
# Dataset with missing values
test_scores = np.array([85.0, np.nan, 92.0, 78.0, np.nan, 96.0, 89.0, 74.0])
students = ['Alice', 'Bob', 'Carol', 'David', 'Eve', 'Frank', 'Grace', 'Henry']
print(f"Test scores: {test_scores}")
print(f"Missing scores: {np.sum(np.isnan(test_scores))}")
print(f"Class average (ignoring NaN): {np.nanmean(test_scores):.1f}")
# Find students with valid scores
valid_mask = ~np.isnan(test_scores)
valid_students = np.array(students)[valid_mask]
valid_scores = test_scores[valid_mask]
print(f"Valid scores: {valid_students} - {valid_scores}")
🔍 Understanding NaN Values
NaN represents missing or undefined data; it propagates through calculations instead of raising errors.
Creating and Detecting NaN
import numpy as np
# Different ways to create NaN
missing_value = np.nan
undefined_calc = np.divide(0.0, 0.0) # Results in NaN (RuntimeWarning; note plain Python 0.0 / 0.0 raises ZeroDivisionError)
invalid_sqrt = np.sqrt(-1) # Results in NaN (warning shown)
# Create array with NaN
data = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
print(f"Data with NaN: {data}")
# Detect NaN values
nan_mask = np.isnan(data)
print(f"NaN mask: {nan_mask}")
print(f"Number of NaN values: {np.sum(nan_mask)}")
print(f"Number of valid values: {np.sum(~nan_mask)}")
# Check individual values
print(f"Is data[2] NaN? {np.isnan(data[2])}")
print(f"Is data[0] NaN? {np.isnan(data[0])}")
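One caveat worth keeping in mind: NaN is a floating-point value, so it can only live in float arrays. A minimal sketch of the dtype behavior:
import numpy as np
# Integer arrays cannot hold NaN
int_data = np.array([1, 2, 3])
print(f"Integer dtype: {int_data.dtype}")
# int_data[0] = np.nan # Would raise ValueError: cannot convert NaN to integer
# Including np.nan in an array literal upcasts everything to float
mixed = np.array([1, 2, np.nan])
print(f"Upcast dtype: {mixed.dtype}") # float64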
NaN Properties
import numpy as np
# NaN comparison behavior
nan_val = np.nan
# NaN is not equal to anything, including itself!
print(f"NaN == NaN: {nan_val == nan_val}") # False!
print(f"NaN != NaN: {nan_val != nan_val}") # True!
print(f"NaN == 5: {nan_val == 5}") # False
# This is why we need np.isnan()
print(f"np.isnan(NaN): {np.isnan(nan_val)}") # True
# NaN in calculations
numbers = np.array([1, 2, np.nan, 4, 5])
print(f"Sum with NaN: {np.sum(numbers)}") # Results in NaN
print(f"Mean with NaN: {np.mean(numbers)}") # Results in NaN
print(f"Max with NaN: {np.max(numbers)}") # Results in NaN
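The same rule applies elementwise: comparing an array against np.nan with == silently matches nothing, which is a common source of bugs. A quick demonstration:
import numpy as np
data = np.array([1.0, np.nan, 3.0])
# == against NaN is False everywhere, so this mask finds nothing
print(f"data == np.nan: {data == np.nan}") # [False False False]
# np.isnan is the reliable elementwise test
print(f"np.isnan(data): {np.isnan(data)}") # [False  True False]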
📊 NaN-Aware Functions
Use special functions that handle NaN values gracefully.
Basic NaN-Aware Statistics
import numpy as np
# Sales data with missing days
daily_sales = np.array([1200, 1350, np.nan, 1420, 1290, np.nan, 1380, 1250])
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon']
print(f"Daily sales: {daily_sales}")
print(f"Days tracked: {days}")
# Regular functions return NaN
print(f"Regular mean: {np.mean(daily_sales)}")
print(f"Regular sum: {np.sum(daily_sales)}")
# NaN-aware functions ignore NaN
print(f"NaN-aware mean: {np.nanmean(daily_sales):.0f}")
print(f"NaN-aware sum: {np.nansum(daily_sales):.0f}")
print(f"NaN-aware max: {np.nanmax(daily_sales):.0f}")
print(f"NaN-aware min: {np.nanmin(daily_sales):.0f}")
print(f"NaN-aware std: {np.nanstd(daily_sales):.0f}")
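One edge case worth knowing: when every value is NaN, np.nansum returns 0.0 (the sum of nothing), while np.nanmean returns NaN and emits a RuntimeWarning. A small sketch:
import numpy as np
all_missing = np.array([np.nan, np.nan, np.nan])
print(f"nansum of all-NaN: {np.nansum(all_missing)}") # 0.0 - an empty sum is zero
print(f"nanmean of all-NaN: {np.nanmean(all_missing)}") # nan (RuntimeWarning: Mean of empty slice)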
Complete NaN Function Set
import numpy as np
# Product ratings with missing data
ratings = np.array([4.5, np.nan, 3.8, 4.9, np.nan, 4.2, 3.5, 4.7, np.nan])
products = [f"Product-{i+1}" for i in range(len(ratings))]
print(f"Product ratings: {ratings}")
# Statistical analysis ignoring NaN
stats = {
    'count': np.sum(~np.isnan(ratings)),
    'mean': np.nanmean(ratings),
    'median': np.nanmedian(ratings),
    'std': np.nanstd(ratings),
    'min': np.nanmin(ratings),
    'max': np.nanmax(ratings),
    'percentile_25': np.nanpercentile(ratings, 25),
    'percentile_75': np.nanpercentile(ratings, 75)
}
print(f"\n📊 Rating Statistics (ignoring NaN):")
for metric, value in stats.items():
    if metric == 'count':
        print(f"{metric}: {value}")
    else:
        print(f"{metric}: {value:.2f}")
# Data quality
missing_count = np.sum(np.isnan(ratings))
completion_rate = (len(ratings) - missing_count) / len(ratings)
print(f"\nData quality: {completion_rate:.1%} complete ({missing_count} missing)")
🔧 Filtering and Cleaning Data
Remove or handle NaN values based on your analysis needs.
Removing NaN Values
import numpy as np
# Survey responses with missing data
responses = np.array([7, np.nan, 8, 6, np.nan, 9, 5, np.nan, 8, 7])
respondent_ids = np.array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110])
print(f"Original responses: {responses}")
print(f"Respondent IDs: {respondent_ids}")
# Method 1: Filter out NaN values
valid_mask = ~np.isnan(responses)
clean_responses = responses[valid_mask]
clean_ids = respondent_ids[valid_mask]
print(f"\nAfter removing NaN:")
print(f"Clean responses: {clean_responses}")
print(f"Valid respondents: {clean_ids}")
# Method 2: Use boolean indexing directly
finite_responses = responses[np.isfinite(responses)] # Removes NaN and inf
print(f"Finite responses: {finite_responses}")
# Analysis with clean data
print(f"\nAnalysis:")
print(f"Response rate: {len(clean_responses)}/{len(responses)} ({len(clean_responses)/len(responses):.1%})")
print(f"Average response: {np.mean(clean_responses):.1f}")
print(f"Response range: {np.min(clean_responses):.0f} - {np.max(clean_responses):.0f}")
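The same masking idea extends to paired arrays: combine the individual masks with & so only rows where every array has a value survive. A sketch with hypothetical height/weight measurements:
import numpy as np
# Hypothetical paired measurements; either value may be missing
heights = np.array([170.0, np.nan, 165.0, 180.0])
weights = np.array([65.0, 72.0, np.nan, 80.0])
# Keep only pairs where BOTH values are present
both_valid = ~np.isnan(heights) & ~np.isnan(weights)
print(f"Valid heights: {heights[both_valid]}") # [170. 180.]
print(f"Valid weights: {weights[both_valid]}") # [65. 80.]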
Filling Missing Values
import numpy as np
# Temperature data with sensor failures
temperatures = np.array([22.1, 23.5, np.nan, 24.2, np.nan, 25.1, 23.8, np.nan])
hours = np.arange(len(temperatures))
print(f"Temperature readings: {temperatures}")
# Method 1: Fill with mean
mean_temp = np.nanmean(temperatures)
filled_with_mean = np.where(np.isnan(temperatures), mean_temp, temperatures)
print(f"Filled with mean ({mean_temp:.1f}): {filled_with_mean.round(1)}")
# Method 2: Forward fill (use last valid value)
filled_forward = temperatures.copy()
for i in range(1, len(filled_forward)):
    if np.isnan(filled_forward[i]):
        filled_forward[i] = filled_forward[i-1]
print(f"Forward filled: {filled_forward.round(1)}")
# Method 3: Interpolation (simple linear)
valid_indices = ~np.isnan(temperatures)
if np.sum(valid_indices) >= 2:
    # Linear interpolation between valid points
    filled_interp = np.interp(hours, hours[valid_indices], temperatures[valid_indices])
    print(f"Interpolated: {filled_interp.round(1)}")
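For large arrays, the forward-fill loop above can be replaced by a vectorized idiom: carry the index of the last valid entry forward with np.maximum.accumulate. A sketch (leading NaNs stay NaN, since there is no earlier value to carry):
import numpy as np
temps = np.array([22.1, 23.5, np.nan, 24.2, np.nan, 25.1, 23.8, np.nan])
valid = ~np.isnan(temps)
# Index of each element's most recent valid neighbor (0 where none seen yet)
idx = np.where(valid, np.arange(len(temps)), 0)
np.maximum.accumulate(idx, out=idx)
filled = temps[idx]
print(f"Vectorized forward fill: {filled.round(1)}")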
📈 Multi-dimensional NaN Handling
Work with missing data in 2D arrays and matrices.
Array-wise NaN Analysis
import numpy as np
# Student grades: rows=students, cols=subjects
grades = np.array([[85, np.nan, 92, 88], # Alice
                   [79, 85, np.nan, 82],       # Bob
                   [np.nan, 89, 96, 93],       # Carol
                   [72, np.nan, 74, np.nan]])  # David
students = ['Alice', 'Bob', 'Carol', 'David']
subjects = ['Math', 'Science', 'English', 'History']
print(f"Grades matrix:")
print(grades)
# Analyze by student (rows)
print(f"\n👨‍🎓 By Student:")
for i, student in enumerate(students):
    student_grades = grades[i, :]
    valid_count = np.sum(~np.isnan(student_grades))
    if valid_count > 0:
        avg_grade = np.nanmean(student_grades)
        print(f"{student}: {valid_count}/4 subjects, avg: {avg_grade:.1f}")
    else:
        print(f"{student}: No valid grades")
# Analyze by subject (columns)
print(f"\n📚 By Subject:")
for j, subject in enumerate(subjects):
    subject_grades = grades[:, j]
    valid_count = np.sum(~np.isnan(subject_grades))
    if valid_count > 0:
        avg_grade = np.nanmean(subject_grades)
        print(f"{subject}: {valid_count}/4 students, avg: {avg_grade:.1f}")
    else:
        print(f"{subject}: No valid grades")
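The loops above are handy for labeled reports, but the per-row and per-column averages can also come from a single call by passing an axis argument to np.nanmean (an all-NaN row or column yields NaN plus a RuntimeWarning):
import numpy as np
grades = np.array([[85, np.nan, 92, 88],
                   [79, 85, np.nan, 82],
                   [np.nan, 89, 96, 93],
                   [72, np.nan, 74, np.nan]])
print(f"Student averages: {np.nanmean(grades, axis=1).round(1)}") # one value per row
print(f"Subject averages: {np.nanmean(grades, axis=0).round(1)}") # one value per column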
Row and Column Filtering
import numpy as np
# Sales data: rows=regions, cols=months
sales_data = np.array([[120, 135, np.nan, 160], # North
                       [np.nan, np.nan, np.nan, np.nan],  # South (all missing)
                       [156, 167, 175, 185],              # East
                       [89, np.nan, 105, 125]])           # West
regions = ['North', 'South', 'East', 'West']
months = ['Jan', 'Feb', 'Mar', 'Apr']
print(f"Sales data:")
print(sales_data)
# Find rows/regions with any valid data
rows_with_data = ~np.all(np.isnan(sales_data), axis=1)
valid_regions = np.array(regions)[rows_with_data]
clean_sales = sales_data[rows_with_data]
print(f"\nRegions with data: {valid_regions}")
print(f"Clean sales data:")
print(clean_sales)
# Find columns/months with any valid data
cols_with_data = ~np.all(np.isnan(sales_data), axis=0)
valid_months = np.array(months)[cols_with_data]
print(f"Months with data: {valid_months}")
# Complete analysis (only regions and months with data)
complete_data = sales_data[np.ix_(rows_with_data, cols_with_data)]
print(f"\nComplete data subset:")
print(complete_data)
print(f"Overall average: {np.nanmean(complete_data):.1f}")
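A related filter worth knowing: swapping np.all for np.any selects complete cases, i.e. rows with no missing values at all, rather than rows with at least one value:
import numpy as np
sales_data = np.array([[120, 135, np.nan, 160],
                       [np.nan, np.nan, np.nan, np.nan],
                       [156, 167, 175, 185],
                       [89, np.nan, 105, 125]])
# Rows where NO value is NaN (here, only East qualifies)
complete_rows = ~np.any(np.isnan(sales_data), axis=1)
print(sales_data[complete_rows])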
🚨 Data Quality Assessment
Evaluate the completeness and reliability of your datasets.
Completeness Report
import numpy as np
# Customer survey data
survey_data = np.array([
    [25, 4, np.nan, 1],                # Customer 1
    [np.nan, 5, 3, 1],                 # Customer 2
    [35, np.nan, 4, np.nan],           # Customer 3
    [28, 3, 2, 0],                     # Customer 4
    [np.nan, np.nan, np.nan, np.nan],  # Customer 5 (no responses)
    [42, 4, 5, 1]                      # Customer 6
])
questions = ['Age', 'Satisfaction', 'Recommendation', 'Will_Return']
print(f"📊 Data Quality Report:")
print(f"Total responses: {survey_data.shape[0]}")
print(f"Total questions: {survey_data.shape[1]}")
# Overall completeness
total_cells = survey_data.size
missing_cells = np.sum(np.isnan(survey_data))
completeness = (total_cells - missing_cells) / total_cells
print(f"Overall completeness: {completeness:.1%}")
print(f"Missing values: {missing_cells}/{total_cells}")
# Per-question completeness
print(f"\n📋 Question Completeness:")
for i, question in enumerate(questions):
    col_data = survey_data[:, i]
    valid_responses = np.sum(~np.isnan(col_data))
    completion_rate = valid_responses / len(col_data)
    print(f"{question}: {completion_rate:.1%} ({valid_responses}/{len(col_data)})")
# Per-respondent completeness
print(f"\n👥 Respondent Completeness:")
complete_responses = 0
for i in range(survey_data.shape[0]):
    row_data = survey_data[i, :]
    valid_answers = np.sum(~np.isnan(row_data))
    completion_rate = valid_answers / len(row_data)
    print(f"Customer {i+1}: {completion_rate:.1%} ({valid_answers}/{len(row_data)})")
    if completion_rate == 1.0:
        complete_responses += 1
print(f"\nFully complete responses: {complete_responses}/{survey_data.shape[0]} ({complete_responses/survey_data.shape[0]:.1%})")
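Because the mean of a boolean mask is exactly a proportion, the same report can be produced without loops — np.mean(~np.isnan(...)) along each axis gives the completion rates directly:
import numpy as np
survey_data = np.array([
    [25, 4, np.nan, 1],
    [np.nan, 5, 3, 1],
    [35, np.nan, 4, np.nan],
    [28, 3, 2, 0],
    [np.nan, np.nan, np.nan, np.nan],
    [42, 4, 5, 1]
])
per_question = np.mean(~np.isnan(survey_data), axis=0) # completion rate per column
per_customer = np.mean(~np.isnan(survey_data), axis=1) # completion rate per row
print(f"Per-question completeness: {per_question}")
print(f"Per-customer completeness: {per_customer}")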
🎯 Key Takeaways
- NaN never equals anything, including itself — always test with np.isnan(), never with ==.
- Regular aggregations (np.sum, np.mean, np.max) propagate NaN; the nan-prefixed versions (np.nansum, np.nanmean, np.nanmax, ...) skip it.
- Boolean masks built from ~np.isnan() let you filter values while keeping parallel arrays (IDs, labels) aligned.
- Missing values can be filled with a constant, forward-filled, or linearly interpolated with np.interp, depending on the data.
- In 2D arrays, combine np.isnan with np.any/np.all and an axis argument to drop or keep whole rows and columns.
🚀 What's Next?
You've mastered missing data handling! Next, learn performance optimization for large-scale data processing.
Continue to: Performance Optimization