🔧 Working with NaN Values

Missing data is a reality in real-world datasets! NumPy's NaN (Not a Number) handling provides robust tools for detecting, filtering, and analyzing incomplete data without breaking your calculations.

import numpy as np

# Eight exam results; np.nan marks a score that was never recorded
test_scores = np.array([85.0, np.nan, 92.0, 78.0, np.nan, 96.0, 89.0, 74.0])
students = ['Alice', 'Bob', 'Carol', 'David', 'Eve', 'Frank', 'Grace', 'Henry']

# True wherever a score is missing; reused for the count and the filter below
is_missing = np.isnan(test_scores)
class_average = np.nanmean(test_scores)

print(f"Test scores: {test_scores}")
print(f"Missing scores: {is_missing.sum()}")
print(f"Class average (ignoring NaN): {class_average:.1f}")

# Keep names and scores aligned by filtering both with the same mask
valid_mask = ~is_missing
valid_students = np.array(students)[valid_mask]
valid_scores = test_scores[valid_mask]
print(f"Valid scores: {valid_students} - {valid_scores}")

🔍 Understanding NaN Values

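NaN is a special floating-point value that marks missing or undefined data. It never raises an error by itself, but it propagates through ordinary calculations, so it has to be detected and handled explicitly.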

Creating and Detecting NaN

import numpy as np

# Different ways to create NaN
missing_value = np.nan
# Plain Python floats raise ZeroDivisionError on 0.0 / 0.0, so the division
# is done on NumPy scalars, which follow IEEE 754 and return NaN instead.
undefined_calc = np.float64(0.0) / np.float64(0.0)  # Results in NaN (warning shown)
invalid_sqrt = np.sqrt(-1)   # Results in NaN (warning shown)

# Create array with NaN (the array is float64 because NaN is a float value)
data = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
print(f"Data with NaN: {data}")

# Detect NaN values: np.isnan returns a boolean mask of the same shape
nan_mask = np.isnan(data)
print(f"NaN mask: {nan_mask}")
# Summing a boolean mask counts its True entries
print(f"Number of NaN values: {np.sum(nan_mask)}")
print(f"Number of valid values: {np.sum(~nan_mask)}")

# Check individual values
print(f"Is data[2] NaN? {np.isnan(data[2])}")
print(f"Is data[0] NaN? {np.isnan(data[0])}")

NaN Properties

import numpy as np

# A single NaN value to experiment with
nan_val = np.nan

# Equality comparisons involving NaN are never true, not even against itself
equals_itself = nan_val == nan_val
differs_from_itself = nan_val != nan_val
equals_five = nan_val == 5
print(f"NaN == NaN: {equals_itself}")      # False!
print(f"NaN != NaN: {differs_from_itself}")      # True!
print(f"NaN == 5: {equals_five}")              # False

# Hence the dedicated predicate for detecting NaN
detected = np.isnan(nan_val)
print(f"np.isnan(NaN): {detected}")    # True

# One NaN is enough to turn an ordinary reduction into NaN
numbers = np.array([1, 2, np.nan, 4, 5])
print(f"Sum with NaN: {numbers.sum()}")       # Results in NaN
print(f"Mean with NaN: {numbers.mean()}")     # Results in NaN
print(f"Max with NaN: {numbers.max()}")       # Results in NaN

📊 NaN-Aware Functions

Use NumPy's `nan`-prefixed functions (such as `np.nanmean`, `np.nansum`, and `np.nanmax`), which skip NaN values instead of propagating them.

Basic NaN-Aware Statistics

import numpy as np

# Eight consecutive days of sales; np.nan marks days with no recorded figure
daily_sales = np.array([1200, 1350, np.nan, 1420, 1290, np.nan, 1380, 1250])
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'Mon']

print(f"Daily sales: {daily_sales}")
print(f"Days tracked: {days}")

# Ordinary reductions propagate NaN, so both results are NaN
plain_mean = daily_sales.mean()
plain_sum = daily_sales.sum()
print(f"Regular mean: {plain_mean}")
print(f"Regular sum: {plain_sum}")

# The nan* counterparts skip the missing entries
robust_mean = np.nanmean(daily_sales)
robust_sum = np.nansum(daily_sales)
robust_max = np.nanmax(daily_sales)
robust_min = np.nanmin(daily_sales)
robust_std = np.nanstd(daily_sales)
print(f"NaN-aware mean: {robust_mean:.0f}")
print(f"NaN-aware sum: {robust_sum:.0f}")
print(f"NaN-aware max: {robust_max:.0f}")
print(f"NaN-aware min: {robust_min:.0f}")
print(f"NaN-aware std: {robust_std:.0f}")

Complete NaN Function Set

import numpy as np

# Nine product ratings; np.nan marks products nobody has rated yet
ratings = np.array([4.5, np.nan, 3.8, 4.9, np.nan, 4.2, 3.5, 4.7, np.nan])
products = [f"Product-{number}" for number in range(1, len(ratings) + 1)]

print(f"Product ratings: {ratings}")

# Mask of rated products, shared by the count and the data-quality figures
is_rated = ~np.isnan(ratings)
lower_quartile, upper_quartile = (np.nanpercentile(ratings, q) for q in (25, 75))

# Summary statistics computed over the rated products only
stats = {
    'count': is_rated.sum(),
    'mean': np.nanmean(ratings),
    'median': np.nanmedian(ratings),
    'std': np.nanstd(ratings),
    'min': np.nanmin(ratings),
    'max': np.nanmax(ratings),
    'percentile_25': lower_quartile,
    'percentile_75': upper_quartile,
}

print("\n📊 Rating Statistics (ignoring NaN):")
for metric, value in stats.items():
    # The count is an integer; every other metric is shown with two decimals
    if metric != 'count':
        print(f"{metric}: {value:.2f}")
    else:
        print(f"{metric}: {value}")

# Data quality: share of products that actually have a rating
missing_count = (~is_rated).sum()
completion_rate = (ratings.size - missing_count) / ratings.size
print(f"\nData quality: {completion_rate:.1%} complete ({missing_count} missing)")

🔧 Filtering and Cleaning Data

Remove or handle NaN values based on your analysis needs.

Removing NaN Values

import numpy as np

# Ten survey answers; np.nan marks respondents who skipped the question
responses = np.array([7, np.nan, 8, 6, np.nan, 9, 5, np.nan, 8, 7])
respondent_ids = np.array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110])

print(f"Original responses: {responses}")
print(f"Respondent IDs: {respondent_ids}")

# Method 1: build one mask and apply it to both arrays so they stay aligned
unanswered = np.isnan(responses)
valid_mask = ~unanswered
clean_responses, clean_ids = responses[valid_mask], respondent_ids[valid_mask]

print("\nAfter removing NaN:")
print(f"Clean responses: {clean_responses}")
print(f"Valid respondents: {clean_ids}")

# Method 2: index with np.isfinite, which drops NaN and infinities together
keep_finite = np.isfinite(responses)
finite_responses = responses[keep_finite]
print(f"Finite responses: {finite_responses}")

# Analysis with clean data
answered, asked = len(clean_responses), len(responses)
lowest, highest = clean_responses.min(), clean_responses.max()
print("\nAnalysis:")
print(f"Response rate: {answered}/{asked} ({answered / asked:.1%})")
print(f"Average response: {clean_responses.mean():.1f}")
print(f"Response range: {lowest:.0f} - {highest:.0f}")

Filling Missing Values

import numpy as np

# Hourly temperature readings; np.nan marks hours where the sensor failed
temperatures = np.array([22.1, 23.5, np.nan, 24.2, np.nan, 25.1, 23.8, np.nan])
hours = np.arange(temperatures.size)

print(f"Temperature readings: {temperatures}")

# Positions of the gaps, shared by all three filling strategies
gap_mask = np.isnan(temperatures)

# Method 1: replace every gap with the mean of the valid readings
mean_temp = np.nanmean(temperatures)
filled_with_mean = temperatures.copy()
filled_with_mean[gap_mask] = mean_temp
print(f"Filled with mean ({mean_temp:.1f}): {filled_with_mean.round(1)}")

# Method 2: forward fill. For each hour, find the index of the most recent
# valid reading (a running maximum over valid indices) and read that value.
# A gap at the very start has nothing before it and therefore stays NaN.
last_valid_hour = np.maximum.accumulate(np.where(gap_mask, 0, hours))
filled_forward = temperatures[last_valid_hour]
print(f"Forward filled: {filled_forward.round(1)}")

# Method 3: linear interpolation, which needs at least two known points
valid_indices = ~gap_mask
if valid_indices.sum() >= 2:
    known_hours = hours[valid_indices]
    known_temps = temperatures[valid_indices]
    # np.interp holds the nearest known value for hours outside the known range
    filled_interp = np.interp(hours, known_hours, known_temps)
    print(f"Interpolated: {filled_interp.round(1)}")

📈 Multi-dimensional NaN Handling

Work with missing data in 2D arrays and matrices.

Array-wise NaN Analysis

import numpy as np

# Grade matrix: one row per student, one column per subject; np.nan = no grade
grades = np.array([[85, np.nan, 92, 88],      # Alice
                   [79, 85, np.nan, 82],      # Bob
                   [np.nan, 89, 96, 93],      # Carol
                   [72, np.nan, 74, np.nan]])  # David

students = ['Alice', 'Bob', 'Carol', 'David']
subjects = ['Math', 'Science', 'English', 'History']

print("Grades matrix:")
print(grades)

# Analyze by student: iterating the matrix yields one row per student
print("\n👨‍🎓 By Student:")
for student, student_grades in zip(students, grades):
    valid_count = (~np.isnan(student_grades)).sum()
    if valid_count <= 0:
        # Skip the average here: np.nanmean of an all-NaN slice only warns
        print(f"{student}: No valid grades")
    else:
        avg_grade = np.nanmean(student_grades)
        print(f"{student}: {valid_count}/4 subjects, avg: {avg_grade:.1f}")

# Analyze by subject: iterating the transpose yields one column per subject
print("\n📚 By Subject:")
for subject, subject_grades in zip(subjects, grades.T):
    valid_count = (~np.isnan(subject_grades)).sum()
    if valid_count <= 0:
        print(f"{subject}: No valid grades")
    else:
        avg_grade = np.nanmean(subject_grades)
        print(f"{subject}: {valid_count}/4 students, avg: {avg_grade:.1f}")

Row and Column Filtering

import numpy as np

# Sales matrix: one row per region, one column per month; np.nan = no figure
sales_data = np.array([[120, 135, np.nan, 160],           # North
                       [np.nan, np.nan, np.nan, np.nan],  # South (all missing)
                       [156, 167, 175, 185],              # East
                       [89, np.nan, 105, 125]])           # West

regions = ['North', 'South', 'East', 'West']
months = ['Jan', 'Feb', 'Mar', 'Apr']

print("Sales data:")
print(sales_data)

# True wherever a figure is present; "not all NaN" is the same as "any present"
has_value = ~np.isnan(sales_data)

# Regions (rows) that contain at least one figure
rows_with_data = has_value.any(axis=1)
valid_regions = np.array(regions)[rows_with_data]
clean_sales = sales_data[rows_with_data]

print(f"\nRegions with data: {valid_regions}")
print("Clean sales data:")
print(clean_sales)

# Months (columns) that contain at least one figure
cols_with_data = has_value.any(axis=0)
valid_months = np.array(months)[cols_with_data]
print(f"Months with data: {valid_months}")

# Drop the empty rows and columns. Individual cells may still be NaN, which is
# why the overall average below uses np.nanmean.
complete_data = clean_sales[:, cols_with_data]
overall_average = np.nanmean(complete_data)
print("\nComplete data subset:")
print(complete_data)
print(f"Overall average: {overall_average:.1f}")

🚨 Data Quality Assessment

Evaluate the completeness and reliability of your datasets.

Completeness Report

import numpy as np

# Survey matrix: one row per customer, one column per question; np.nan = blank
survey_data = np.array([
    [25, 4, np.nan, 1],                # Customer 1
    [np.nan, 5, 3, 1],                 # Customer 2
    [35, np.nan, 4, np.nan],           # Customer 3
    [28, 3, 2, 0],                     # Customer 4
    [np.nan, np.nan, np.nan, np.nan],  # Customer 5 (no responses)
    [42, 4, 5, 1],                     # Customer 6
])

questions = ['Age', 'Satisfaction', 'Recommendation', 'Will_Return']

n_customers, n_questions = survey_data.shape

print("📊 Data Quality Report:")
print(f"Total responses: {n_customers}")
print(f"Total questions: {n_questions}")

# Overall completeness: share of cells that hold an answer
answered = ~np.isnan(survey_data)
total_cells = survey_data.size
missing_cells = (~answered).sum()
completeness = (total_cells - missing_cells) / total_cells

print(f"Overall completeness: {completeness:.1%}")
print(f"Missing values: {missing_cells}/{total_cells}")

# Per-question completeness: walk the columns via the transpose
print("\n📋 Question Completeness:")
for question, col_data in zip(questions, survey_data.T):
    valid_responses = (~np.isnan(col_data)).sum()
    completion_rate = valid_responses / len(col_data)
    print(f"{question}: {completion_rate:.1%} ({valid_responses}/{len(col_data)})")

# Per-respondent completeness: walk the rows, numbering customers from 1
print("\n👥 Respondent Completeness:")
complete_responses = 0
for customer_number, row_data in enumerate(survey_data, start=1):
    valid_answers = (~np.isnan(row_data)).sum()
    completion_rate = valid_answers / len(row_data)
    print(f"Customer {customer_number}: {completion_rate:.1%} ({valid_answers}/{len(row_data)})")
    # A row is fully complete exactly when every one of its answers is present
    if valid_answers == len(row_data):
        complete_responses += 1

print(f"\nFully complete responses: {complete_responses}/{n_customers} ({complete_responses/n_customers:.1%})")

🎯 Key Takeaways

🚀 What's Next?

You've now mastered missing-data handling! Next, learn performance optimization for large-scale data processing.

Continue to: Performance Optimization

Online Python

🔧 Working with NaN Values

Track Your Learning Progress