📊 Statistical Operations

NumPy provides powerful statistical functions to analyze and understand your data! From basic descriptive statistics to advanced correlation analysis, these tools help extract insights from numerical data.

import numpy as np

# Quick tour: one small data set, four summary statistics
test_scores = np.array([85, 92, 78, 96, 89, 74, 88, 93, 82, 90])
print(f"Test scores: {test_scores}")

# Compute each statistic once, then report it
average = test_scores.mean()
middle = np.median(test_scores)
spread = test_scores.std()
score_range = np.ptp(test_scores)  # peak-to-peak: largest minus smallest

print(f"Mean: {average:.1f}")
print(f"Median: {middle:.1f}")
print(f"Std Dev: {spread:.2f}")
print(f"Range: {score_range}")

📈 Descriptive Statistics

Understand your data distribution with central tendency and spread measures.

Central Tendency

import numpy as np

grades = np.array([78, 85, 92, 88, 79, 94, 87, 91, 83, 89])

# Measures of central tendency
mean_grade = np.mean(grades)      # arithmetic average
median_grade = np.median(grades)  # middle value of the sorted data

print(f"Grades: {grades}")
print(f"Mean (average): {mean_grade:.1f}")
print(f"Median (middle): {median_grade:.1f}")

# Find mode (most frequent): np.unique returns the sorted distinct values
# together with how many times each one occurs.
unique, counts = np.unique(grades, return_counts=True)
mode_idx = np.argmax(counts)  # first (smallest) value with the highest count
top_count = counts[mode_idx]

# A mode only exists when some value repeats. When every value occurs once,
# argmax simply lands on the smallest grade, so report that case explicitly;
# otherwise list every value tied for the highest count.
modes = unique[counts == top_count] if top_count > 1 else unique[:0]
if modes.size:
    mode_text = ", ".join(str(value) for value in modes)
    print(f"Most common: {mode_text} (appears {top_count} times)")
else:
    print("Most common: none (every grade appears exactly once)")

Measures of Spread

import numpy as np

sales = np.array([1200, 1350, 1180, 1420, 1290, 1380])

# How widely the values are scattered
lowest, highest = np.min(sales), np.max(sales)
range_val = np.ptp(sales)  # same as highest - lowest
variance = sales.var()     # mean squared deviation from the mean
std_dev = sales.std()      # square root of the variance

print(f"Sales: {sales}")
print(f"Standard deviation: {std_dev:.1f}")
print(f"Variance: {variance:.1f}")
print(f"Range: {range_val}")
print(f"Min/Max: {lowest}, {highest}")

Percentiles and Quartiles

import numpy as np

response_times = np.array([120, 150, 180, 200, 230, 250, 280, 320, 350, 400])

# Quartile analysis: request all three cut points in a single call
# (q2, the 50th percentile, is the median)
q1, q2, q3 = np.percentile(response_times, [25, 50, 75])
iqr = q3 - q1  # interquartile range: width of the middle half of the data

print(f"Response times: {response_times}")
print(f"Q1 (25th): {q1}")
print(f"Q2 (50th): {q2}")
print(f"Q3 (75th): {q3}")
print(f"IQR: {iqr}")

📊 Multi-dimensional Statistics

Analyze statistics along specific axes of arrays.

By Rows and Columns

import numpy as np

# Student scores: rows=students, cols=subjects
scores = np.array([[85, 92, 78, 88],   # Alice
                   [79, 85, 91, 82],   # Bob
                   [94, 89, 96, 93],   # Carol
                   [72, 78, 74, 76]])  # David

# Labels for the rows and columns of `scores`, in the same order
students = ['Alice', 'Bob', 'Carol', 'David']
subjects = ['Math', 'Science', 'English', 'History']

# Statistics by student (axis=1): average across the columns of each row
student_avgs = np.mean(scores, axis=1)
print(f"Student averages: {student_avgs.round(1)}")
for student, avg in zip(students, student_avgs):
    print(f"  {student}: {avg:.1f}")

# Statistics by subject (axis=0): average down the rows of each column
subject_avgs = np.mean(scores, axis=0)
print(f"Subject averages: {subject_avgs.round(1)}")
for subject, avg in zip(subjects, subject_avgs):
    print(f"  {subject}: {avg:.1f}")

Finding Best Performers

import numpy as np

# Row and column labels for the score table below
students = ['Alice', 'Bob', 'Carol', 'David']
subjects = ['Math', 'Science', 'English', 'History']

scores = np.array([[85, 92, 78, 88],
                   [79, 85, 91, 82],
                   [94, 89, 96, 93],
                   [72, 78, 74, 76]])

# Best overall student: add up each row, then locate the largest total
totals = scores.sum(axis=1)
best_idx = totals.argmax()
print(f"Top student: {students[best_idx]} ({totals[best_idx]} total)")

# Strongest subject: average each column, then locate the largest mean
subject_avgs = scores.mean(axis=0)
best_subject = subject_avgs.argmax()
print(f"Best subject: {subjects[best_subject]} ({subject_avgs[best_subject]:.1f})")

🔗 Correlation Analysis

Measure relationships between variables.

Correlation Coefficient

import numpy as np

def describe_correlation(r):
    """Return a plain-language label for a correlation coefficient.

    Args:
        r: Correlation coefficient, normally in the range [-1, 1].

    Returns:
        "Strong <direction> correlation!" when |r| > 0.7,
        "Moderate <direction> correlation" when |r| > 0.3,
        "Weak correlation" otherwise, where <direction> is "positive" or
        "negative". "Correlation undefined" is returned when r is NaN.
    """
    # NaN compares False against every threshold, so it would otherwise be
    # reported as a (misleading) weak correlation.
    if np.isnan(r):
        return "Correlation undefined"

    # Strength depends on the magnitude; the sign only gives the direction.
    strength = abs(r)
    if strength <= 0.3:
        return "Weak correlation"
    direction = "positive" if r > 0 else "negative"
    if strength > 0.7:
        return f"Strong {direction} correlation!"
    return f"Moderate {direction} correlation"


# Study hours vs test scores
study_hours = np.array([2, 4, 3, 6, 5, 8, 7, 9, 1, 10])
test_scores = np.array([65, 75, 70, 85, 80, 95, 90, 98, 60, 100])

# Calculate correlation: corrcoef returns a 2x2 matrix, and the
# off-diagonal entry [0, 1] is the coefficient between the two inputs.
correlation = np.corrcoef(study_hours, test_scores)[0, 1]

print(f"Study hours: {study_hours}")
print(f"Test scores: {test_scores}")
print(f"Correlation: {correlation:.3f}")

print(describe_correlation(correlation))

Correlation Matrix

import numpy as np

# One row per person; the columns hold height, weight and age
variables = ['Height', 'Weight', 'Age']
data = np.array([[170, 65, 25],   # Person 1
                 [175, 70, 30],   # Person 2
                 [165, 60, 22],   # Person 3
                 [180, 80, 35],   # Person 4
                 [160, 55, 20]])  # Person 5

# rowvar=False tells corrcoef that each column (not each row) is a variable
corr_matrix = np.corrcoef(data, rowvar=False)

print("Correlation matrix:")
for var, row in zip(variables, corr_matrix):
    print(f"{var}: {row.round(3)}")

📊 Distribution Analysis

Analyze how data is distributed.

Histogram Analysis

import numpy as np

exam_scores = np.array([65, 67, 70, 72, 75, 78, 80, 82, 85, 87,
                        89, 90, 92, 94, 95, 96, 98, 85, 88, 91])

# Bucket the scores into 5 bins; bin_edges has one more entry than hist
hist, bin_edges = np.histogram(exam_scores, bins=5)

print("Score distribution:")
# Pair every bin's lower edge with its upper edge and its count
for low, high, count in zip(bin_edges[:-1], bin_edges[1:], hist):
    print(f"{low:.0f}-{high:.0f}: {count} students")

# Skewness indicator: the gap between mean and median
# (negative when the mean sits below the median)
skew = exam_scores.mean() - np.median(exam_scores)
print(f"Skewness indicator: {skew:.2f}")

Outlier Detection

import numpy as np

# Sample readings that include one extreme value
data = np.array([120, 115, 130, 125, 140, 135, 128, 2000, 122, 138])

# IQR method: measure the spread of the middle half of the data
q1, q3 = np.percentile(data, [25, 75])
iqr = q3 - q1

# Fences sit 1.5 IQRs beyond each quartile
fence = 1.5 * iqr
lower = q1 - fence
upper = q3 + fence

# Boolean mask: True wherever a value falls outside [lower, upper]
outliers = ~((data >= lower) & (data <= upper))

print(f"Data: {data}")
print(f"Outlier bounds: [{lower:.1f}, {upper:.1f}]")
print(f"Outliers: {data[outliers]}")

🧮 Advanced Statistics

More sophisticated statistical operations.

Weighted Statistics

import numpy as np

# Each grade is paired with the weight at the same position
weights = np.array([0.2, 0.3, 0.3, 0.2])  # Quiz, Midterm, Final, Project
grades = np.array([85, 92, 78, 96])

# Weighted average next to the plain mean for comparison
weighted_avg = np.average(grades, weights=weights)
plain_avg = grades.mean()

print(f"Grades: {grades}")
print(f"Weights: {weights}")
print(f"Weighted average: {weighted_avg:.1f}")
print(f"Regular average: {plain_avg:.1f}")

Cumulative Statistics

import numpy as np

daily_sales = np.array([120, 150, 130, 180, 160, 200, 175])

# Cumulative operations: running total, then divide by the number of
# days elapsed (1, 2, 3, ...) to get the average so far
cumsum = daily_sales.cumsum()
days_elapsed = np.arange(daily_sales.size) + 1
running_avg = cumsum / days_elapsed

print(f"Daily sales: {daily_sales}")
print(f"Cumulative total: {cumsum}")
print(f"Running average: {running_avg.round(1)}")

# Growth rate: day-over-day change as a percentage of the previous day
previous = daily_sales[:-1]
change = daily_sales[1:] - previous
growth = change / previous * 100
print(f"Daily growth %: {growth.round(1)}")

🧠 Business Analytics Example

import numpy as np

# Revenue figures for two consecutive quarters
q1_revenue = np.array([50000, 45000, 60000, 55000, 48000])
q2_revenue = np.array([55000, 50000, 65000, 60000, 52000])

# Summarise each quarter by its mean and standard deviation
q1_mean = q1_revenue.mean()
q1_std = q1_revenue.std()
q2_mean = q2_revenue.mean()
q2_std = q2_revenue.std()

print(f"Q1 - Mean: ${q1_mean:,.0f}, Std: ${q1_std:,.0f}")
print(f"Q2 - Mean: ${q2_mean:,.0f}, Std: ${q2_std:,.0f}")

# Growth analysis: change in mean revenue as a percentage of the Q1 mean
mean_change = q2_mean - q1_mean
growth = (mean_change / q1_mean) * 100
print(f"Quarter-over-quarter growth: {growth:.1f}%")

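🎯 Key Takeaways

- Central tendency: np.mean and np.median; use np.unique(..., return_counts=True) to find the most frequent value.
- Spread: np.std, np.var, np.ptp, and np.percentile for quartiles and the IQR.
- Pass axis=0 or axis=1 to compute statistics per column or per row of a 2-D array.
- np.corrcoef measures the relationship between variables; np.histogram and the IQR rule describe distributions and flag outliers.
- np.average accepts weights; np.cumsum and np.diff give running totals and period-to-period changes.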

🚀 What's Next?

You've now covered the core statistical tools in NumPy! Next, explore linear algebra for matrix operations.

Continue to: Linear Algebra Operations

Online Python

📊 Statistical Operations

Track Your Learning Progress