🔗 Merging and Join Operations

Merging is like connecting the dots between related data! When you have DataFrames that share common information (like customer IDs or product codes), merging helps you combine them based on these relationships. This is essential for comprehensive data analysis.

Think of merging like looking up information in different address books - you use a person's name to find their phone number in one book and their address in another.

import pandas as pd

# Customer information
customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'city': ['NYC', 'LA', 'Chicago', 'Boston']
})

# Order information
orders = pd.DataFrame({
    'order_id': [101, 102, 103],
    'customer_id': [1, 2, 1],
    'amount': [250, 150, 300]
})

print("Customers:")
print(customers)
print()
print("Orders:")
print(orders)
print()

# Merge customer info with orders
customer_orders = orders.merge(customers, on='customer_id')
print("Orders with customer details:")
print(customer_orders)

🎯 Understanding Merge Operations

Merging combines DataFrames based on common columns (keys):

🔄 Types of Joins

Different join types determine which records are included in the result:

Inner Join Example

import pandas as pd

# Product catalog
products = pd.DataFrame({
    'product_id': [1, 2, 3, 4],
    'name': ['Laptop', 'Mouse', 'Keyboard', 'Monitor'],
    'category': ['Electronics', 'Accessories', 'Accessories', 'Electronics']
})

# Sales data (not all products sold)
sales = pd.DataFrame({
    'product_id': [1, 2, 1, 3],
    'quantity': [2, 5, 1, 3],
    'revenue': [2000, 125, 1000, 225]
})

print("Products:")
print(products)
print()
print("Sales:")
print(sales)
print()

# Inner join - only products that were sold
inner_result = sales.merge(products, on='product_id', how='inner')
print("Inner join - only sold products:")
print(inner_result)

Left Join Example

import pandas as pd

# Customer list
customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana']
})

# Orders (not all customers placed orders)
orders = pd.DataFrame({
    'customer_id': [1, 1, 2],
    'order_amount': [100, 150, 200]
})

print("Customers:")
print(customers)
print()
print("Orders:")
print(orders)
print()

# Left join - all customers, even those without orders
left_result = customers.merge(orders, on='customer_id', how='left')
print("Left join - all customers:")
print(left_result)

Outer Join Example

import pandas as pd

# Q1 sales
q1_sales = pd.DataFrame({
    'product': ['Laptop', 'Mouse', 'Keyboard'],
    'q1_sales': [10, 25, 15]
})

# Q2 sales (some different products)
q2_sales = pd.DataFrame({
    'product': ['Laptop', 'Monitor', 'Tablet'],
    'q2_sales': [12, 8, 5]
})

print("Q1 sales:")
print(q1_sales)
print()
print("Q2 sales:")
print(q2_sales)
print()

# Outer join - all products from both quarters
outer_result = q1_sales.merge(q2_sales, on='product', how='outer')
print("Outer join - all products:")
print(outer_result)

🏷️ Different Column Names

When join keys have different names in each DataFrame:

import pandas as pd

# Employee table
employees = pd.DataFrame({
    'emp_id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'department': ['Sales', 'IT', 'HR']
})

# Salary table (uses different column name)
salaries = pd.DataFrame({
    'employee_id': [1, 2, 3],
    'salary': [50000, 75000, 60000],
    'bonus': [5000, 8000, 6000]
})

print("Employees:")
print(employees)
print()
print("Salaries:")
print(salaries)
print()

# Merge with different column names
emp_salary = employees.merge(salaries, 
                           left_on='emp_id', 
                           right_on='employee_id')
print("Employee salaries:")
print(emp_salary)

📊 Multiple Key Columns

Sometimes you need to match on multiple columns:

import pandas as pd

# Sales targets
targets = pd.DataFrame({
    'region': ['North', 'South', 'North', 'South'],
    'quarter': ['Q1', 'Q1', 'Q2', 'Q2'],
    'target': [100000, 90000, 110000, 95000]
})

# Actual sales
actuals = pd.DataFrame({
    'region': ['North', 'South', 'North', 'South'],
    'quarter': ['Q1', 'Q1', 'Q2', 'Q2'],
    'actual': [105000, 88000, 115000, 98000]
})

print("Targets:")
print(targets)
print()
print("Actuals:")
print(actuals)
print()

# Merge on multiple columns
performance = targets.merge(actuals, on=['region', 'quarter'])
print("Performance vs targets:")
print(performance)
print()

# Calculate performance percentage
performance['achievement'] = (performance['actual'] / performance['target'] * 100).round(1)
print("With achievement percentage:")
print(performance)

🔧 Handling Duplicate Column Names

When both DataFrames have columns with the same name (other than join keys):

import pandas as pd

# Budget data
budget = pd.DataFrame({
    'department': ['Sales', 'IT', 'HR'],
    'amount': [50000, 30000, 25000]
})

# Actual spending
actual = pd.DataFrame({
    'department': ['Sales', 'IT', 'HR'],
    'amount': [52000, 28000, 26000]
})

print("Budget:")
print(budget)
print()
print("Actual:")
print(actual)
print()

# Merge with custom suffixes
budget_vs_actual = budget.merge(actual, 
                               on='department', 
                               suffixes=('_budget', '_actual'))
print("Budget vs actual spending:")
print(budget_vs_actual)
print()

# Calculate variance
budget_vs_actual['variance'] = budget_vs_actual['amount_actual'] - budget_vs_actual['amount_budget']
print("With variance:")
print(budget_vs_actual)

📈 Real-World Example: Customer Analysis

Let's combine multiple data sources for comprehensive customer analysis:

import pandas as pd

# Customer demographics
demographics = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'age_group': ['25-34', '35-44', '18-24', '45-54', '25-34'],
    'region': ['North', 'South', 'East', 'West', 'North']
})

# Purchase history
purchases = pd.DataFrame({
    'customer_id': [1, 1, 2, 3, 4],
    'purchase_amount': [250, 150, 300, 75, 500],
    'product_category': ['Electronics', 'Books', 'Clothing', 'Books', 'Electronics']
})

# Customer service interactions
service = pd.DataFrame({
    'customer_id': [1, 2, 4],
    'satisfaction_score': [4.5, 3.8, 4.9],
    'support_tickets': [1, 3, 0]
})

print("Demographics:")
print(demographics)
print()
print("Purchases:")
print(purchases)
print()
print("Service interactions:")
print(service)
print()

# Step 1: Merge purchases with demographics
purchase_demo = purchases.merge(demographics, on='customer_id', how='left')
print("Purchases with demographics:")
print(purchase_demo)
print()

# Step 2: Add service data (left join to keep all purchase records)
complete_analysis = purchase_demo.merge(service, on='customer_id', how='left')
print("Complete customer analysis:")
print(complete_analysis)
print()

# Quick analysis: Average purchase by region
regional_avg = complete_analysis.groupby('region')['purchase_amount'].mean()
print("Average purchase amount by region:")
print(regional_avg.round(2))

🎯 Merge Validation

You can validate your merge to catch common issues:

import pandas as pd

# Customer master data (should be unique)
customers = pd.DataFrame({
    'customer_id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie']
})

# Orders (one customer can have multiple orders)
orders = pd.DataFrame({
    'customer_id': [1, 1, 2, 3],
    'order_amount': [100, 150, 200, 75]
})

print("Customers (unique):")
print(customers)
print()
print("Orders (multiple per customer):")
print(orders)
print()

# Validate one-to-many relationship
try:
    validated_merge = customers.merge(orders, 
                                    on='customer_id', 
                                    validate='one_to_many')
    print("Merge validation passed!")
    print(validated_merge)
except Exception as e:
    print(f"Validation failed: {e}")

🎯 Key Takeaways

🚀 What's Next?

Excellent! You now know how to combine data using both concatenation and merging. Next, let's explore working with dates and times - essential skills for time-based analysis.

Continue to: Working with Dates

Keep merging! 🔗📊

Online Python

🔗 Merging and Join Operations

Track Your Learning Progress