🔄 Data Type Conversion
Data type conversion is like making sure you're using the right tools for the job - you can't do math with text, and you can't format numbers as dates! Getting data types right is essential for accurate analysis and prevents errors in calculations.
🔍 Checking Current Data Types
First, understand what types you're working with:
import pandas as pd
# Sample customer table in which every field arrived as a string,
# which is how raw exports commonly look
messy_data = pd.DataFrame(
    {
        'customer_id': ['1', '2', '3', '4', '5'],  # numeric IDs stored as text
        'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],  # genuinely text
        'age': ['25', '30', '35', '28', '32'],  # numbers stored as text
        'signup_date': ['2023-01-15', '2023-02-10', '2023-01-25', '2023-03-05', '2023-02-20'],  # dates stored as text
        'is_premium': ['True', 'False', 'True', 'False', 'True'],  # flags stored as text
        'total_spent': ['150.50', '89.25', '200.00', '45.75', '175.80'],  # decimals stored as text
    }
)

# Show the table, then the dtype pandas assigned to each column
print("📊 Data with Mixed-Up Types:", messy_data, sep="\n")
print()
print("🔍 Current Data Types:", messy_data.dtypes, sep="\n")
print()

# One line per column whose type needs fixing
print("Problems:")
for _problem in (
    "- customer_id is text, should be number",
    "- age is text, should be number",
    "- signup_date is text, should be date",
    "- is_premium is text, should be True/False",
    "- total_spent is text, should be decimal number",
):
    print(_problem)
print()
print("Let's fix these types!")
🔢 Converting to Numbers
Convert text that represents numbers into actual numeric types:
import pandas as pd
# Sales data with every numeric column stored as text
sales = pd.DataFrame({
    'product_id': ['101', '102', '103', '104'],
    'price': ['29.99', '49.95', '19.99', '39.99'],
    'quantity': ['5', '2', '8', '3'],
    'discount_percent': ['10', '15', '5', '20'],
})
print("📊 Sales Data (Numbers as Text):")
print(sales)
print("Data types:", sales.dtypes.to_dict())
print()
print("🔢 Converting to Numbers:")
print()

# Method 1: astype() - you name the target type yourself; it raises on any
# value that cannot be parsed, so use it when you're sure the data is clean
print("1️⃣ Using astype() for clean conversions:")
converted = sales.copy()  # copy so the original text columns stay available
converted['product_id'] = converted['product_id'].astype('int')
converted['price'] = converted['price'].astype('float')
converted['quantity'] = converted['quantity'].astype('int')
# discount_percent is numeric text too; convert it so no column is left as text
converted['discount_percent'] = converted['discount_percent'].astype('int')
print("Converted data:")
print(converted)
print("New types:", converted.dtypes.to_dict())
print()

# Method 2: pd.to_numeric() - chooses an integer or float type per column.
# It still raises on bad values by default; pass errors='coerce' to turn
# unparseable values into NaN instead.
print("2️⃣ Using pd.to_numeric() (safer for real-world data):")
safer_conversion = sales.copy()
for col in ['product_id', 'price', 'quantity', 'discount_percent']:
    safer_conversion[col] = pd.to_numeric(safer_conversion[col])
print("Safely converted:")
print(safer_conversion.dtypes.to_dict())
📅 Converting to Dates
Transform text dates into proper datetime objects:
import pandas as pd
# Event data with dates as text
events = pd.DataFrame({
    'event_id': [1, 2, 3, 4],
    'event_name': ['Conference', 'Workshop', 'Seminar', 'Meeting'],
    'start_date': ['2023-03-15', '2023-04-01', '2023-03-22', '2023-04-10'],
    'end_date': ['2023-03-16', '2023-04-01', '2023-03-22', '2023-04-10'],
    'registration_deadline': ['2023-03-01', '2023-03-25', '2023-03-15', '2023-04-05'],
})
print("📊 Event Data (Dates as Text):")
print(events)
print("Date column types:", events[['start_date', 'end_date']].dtypes.to_dict())
print()
print("📅 Converting to Dates:")
print()

print("1️⃣ Convert start_date to datetime:")
events['start_date'] = pd.to_datetime(events['start_date'])
print("start_date type:", events['start_date'].dtype)
print(events['start_date'])
print()

print("2️⃣ Convert multiple date columns:")
date_columns = ['end_date', 'registration_deadline']
for col in date_columns:
    # The assignment must sit inside the loop so each listed column is converted
    events[col] = pd.to_datetime(events[col])
print("All date types:")
print(events.dtypes)
print()

print("3️⃣ Now we can do date operations:")
# Whole days from now until the event starts (negative once the date has passed)
events['days_until_event'] = (events['start_date'] - pd.Timestamp.now()).dt.days
# +1 makes the duration inclusive: an event starting and ending the same day lasts 1 day
events['event_duration'] = (events['end_date'] - events['start_date']).dt.days + 1
print("Date calculations:")
print(events[['event_name', 'start_date', 'event_duration']])
✅ Converting to Boolean
Transform text flags such as Yes/No, True/False, 1/0, and y/n into actual boolean values:
import pandas as pd
# Survey data with boolean values as text, in four different spellings
survey = pd.DataFrame({
    'respondent_id': [1, 2, 3, 4, 5],
    'satisfied': ['Yes', 'No', 'Yes', 'Yes', 'No'],
    'would_recommend': ['True', 'False', 'True', 'False', 'True'],
    'completed_survey': ['1', '0', '1', '1', '0'],
    'opted_in_emails': ['y', 'n', 'Y', 'N', 'y'],
})
print("📊 Survey Data (Booleans as Text):")
print(survey)
print()
print("✅ Converting to Boolean:")
print()

print("1️⃣ Yes/No to True/False:")
# map() looks each value up in the dict; any value not listed would become NaN
survey['satisfied'] = survey['satisfied'].map({'Yes': True, 'No': False})
print("Satisfied column:")
print(survey['satisfied'])
print()

print("2️⃣ True/False strings to boolean:")
# Compare against the text 'True'; every other value becomes False
survey['would_recommend'] = survey['would_recommend'] == 'True'
print("Would recommend column:")
print(survey['would_recommend'])
print()

print("3️⃣ 1/0 to True/False:")
# Do NOT use astype('bool') here: every non-empty string, including '0',
# is truthy, so the whole column would become True. Compare with '1' instead.
survey['completed_survey'] = survey['completed_survey'] == '1'
print("Completed survey column:")
print(survey['completed_survey'])
print()

print("4️⃣ y/n variations to boolean:")
# Lower-case first so 'Y' and 'y' are treated the same
survey['opted_in_emails'] = survey['opted_in_emails'].str.lower().isin(['y', 'yes'])
print("Opted in emails column:")
print(survey['opted_in_emails'])
print()

print("Final data with proper boolean types:")
print(survey.dtypes)
🏷️ Converting to Categories
Convert repetitive text to efficient categorical data:
import pandas as pd
# Customer data with repetitive categories
customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5, 6],
    'membership_level': ['Bronze', 'Silver', 'Gold', 'Bronze', 'Silver', 'Bronze'],
    'region': ['North', 'South', 'East', 'North', 'West', 'South'],
    'preferred_contact': ['Email', 'Phone', 'Email', 'SMS', 'Email', 'Phone'],
})
print("📊 Customer Data with Repetitive Categories:")
print(customers)
print()
print("Memory usage before conversion:")
# deep=True also measures the string contents of text columns
print(customers.memory_usage(deep=True))
print()
print("🏷️ Converting to Categories:")
print()

print("1️⃣ Convert membership_level to category:")
customers['membership_level'] = customers['membership_level'].astype('category')
print("Membership levels:")
print(customers['membership_level'])
print("Categories:", customers['membership_level'].cat.categories.tolist())
print()

print("2️⃣ Convert all text columns to categories:")
text_columns = ['region', 'preferred_contact']
for col in text_columns:
    # The assignment must sit inside the loop so each listed column is converted
    customers[col] = customers[col].astype('category')
print("Data types after conversion:")
print(customers.dtypes)
print()
print("Memory usage after conversion:")
print(customers.memory_usage(deep=True))
print()

print("💡 Benefits of categories:")
print("- Less memory usage")
print("- Faster operations")
print("- Ordered categories possible")
print("- Prevents typos in analysis")
🛠️ Handling Conversion Errors
Real data is messy - handle conversion errors gracefully:
import pandas as pd
import numpy as np
# Messy data with conversion challenges
messy_numbers = pd.DataFrame({
    'id': ['1', '2', '3', '4', '5'],
    'score': ['85', '92', 'N/A', '78', '88'],  # Contains N/A
    'percentage': ['15%', '23%', '8%', 'missing', '45%'],  # Contains % and text
    'amount': ['$100.50', '$75.25', 'free', '$250.00', '$125.75'],  # Contains $ and text
})
print("📊 Messy Data with Conversion Challenges:")
print(messy_numbers)
print()
print("🛠️ Safe Conversion Strategies:")
print()

print("1️⃣ Handle N/A values in scores:")
# Convert 'N/A' to NaN, then to numeric (NaN stays NaN)
messy_numbers['score_clean'] = messy_numbers['score'].replace('N/A', np.nan)
messy_numbers['score_clean'] = pd.to_numeric(messy_numbers['score_clean'])
print("Clean scores:")
print(messy_numbers[['score', 'score_clean']])
print()

print("2️⃣ Clean percentage data:")
# Remove the % symbol as literal text, then convert;
# errors='coerce' turns leftovers such as 'missing' into NaN
messy_numbers['percentage_clean'] = messy_numbers['percentage'].str.replace('%', '', regex=False)
messy_numbers['percentage_clean'] = pd.to_numeric(
    messy_numbers['percentage_clean'], errors='coerce'
)
print("Clean percentages:")
print(messy_numbers[['percentage', 'percentage_clean']])
print()

print("3️⃣ Clean currency data:")
# Remove the $ symbol and handle 'free'.
# regex=False is essential: as a regular expression '$' means "end of string",
# so on pandas versions where regex matching is the default nothing would be
# removed and the numeric conversion below would fail.
messy_numbers['amount_clean'] = messy_numbers['amount'].str.replace('$', '', regex=False)
messy_numbers['amount_clean'] = messy_numbers['amount_clean'].replace('free', '0')
messy_numbers['amount_clean'] = pd.to_numeric(messy_numbers['amount_clean'])
print("Clean amounts:")
print(messy_numbers[['amount', 'amount_clean']])
print()

print("4️⃣ Summary of conversions:")
# Counts the scores that became real numbers (NaN entries are excluded)
print("Successful conversions:", messy_numbers['score_clean'].notna().sum(), "out of", len(messy_numbers))
print("Data types:", messy_numbers[['score_clean', 'percentage_clean', 'amount_clean']].dtypes.to_dict())
🎯 Complete Type Conversion Workflow
A systematic approach to fixing all data types:
import pandas as pd
# Raw order export, written row by row; every field, numeric or not, is a string
raw_data = pd.DataFrame(
    [
        ('1001', 'Alice Johnson', '2023-03-15', 'Electronics', '2', '299.99', 'True', '5'),
        ('1002', 'Bob Smith', '2023-03-16', 'Books', '1', '19.95', 'False', '4'),
        ('1003', 'Charlie Brown', '2023-03-17', 'Electronics', '3', '149.99', 'True', '5'),
        ('1004', 'Diana Lee', '2023-03-18', 'Clothing', '1', '49.99', 'False', '3'),
    ],
    columns=[
        'order_id',
        'customer_name',
        'order_date',
        'product_category',
        'quantity',
        'unit_price',
        'is_express_shipping',
        'customer_rating',
    ],
)
print("📊 Raw Data (All Wrong Types):", raw_data, sep="\n")
print("Original types:", raw_data.dtypes.to_dict())
print()
print("🎯 Complete Type Conversion Workflow:")
print()

# Clean a copy so raw_data keeps the untouched strings
cleaned_data = raw_data.copy()

# IDs: digit strings become integers
print("1️⃣ Convert IDs to integers:")
cleaned_data['order_id'] = pd.to_numeric(cleaned_data['order_id'])

# Dates: ISO-formatted strings become datetime values
print("2️⃣ Convert dates:")
cleaned_data['order_date'] = pd.to_datetime(cleaned_data['order_date'])

# Categories: a small set of repeated labels
print("3️⃣ Convert categories:")
cleaned_data['product_category'] = cleaned_data['product_category'].astype('category')

# Remaining numeric fields share the same conversion
print("4️⃣ Convert numeric columns:")
for _numeric_col in ('quantity', 'unit_price', 'customer_rating'):
    cleaned_data[_numeric_col] = pd.to_numeric(cleaned_data[_numeric_col])

# Boolean: only the exact text 'True' counts as True
print("5️⃣ Convert boolean:")
cleaned_data['is_express_shipping'] = cleaned_data['is_express_shipping'].eq('True')

print("✅ Final cleaned data:", cleaned_data, sep="\n")
print()
print("New types:", cleaned_data.dtypes.to_dict())
print()

# With real types in place, arithmetic and aggregation work directly
print("🎯 Now we can do proper analysis:")
print(f"Average order value: ${(cleaned_data['quantity'] * cleaned_data['unit_price']).mean():.2f}")
print(f"Express shipping rate: {cleaned_data['is_express_shipping'].mean():.1%}")
print(f"Average rating: {cleaned_data['customer_rating'].mean():.1f}/5")
⚠️ Type Conversion Pitfalls
Handle messy real-world data safely to avoid conversion errors:
import pandas as pd
# Each column holds at least one value that does not fit the obvious target type
problematic_data = pd.DataFrame(
    {
        'mixed_numbers': ['100', '200.5', 'N/A', '300'],
        'dates_with_errors': ['2023-01-15', '2023-02-30', '2023-03-15', 'invalid'],
        'yes_no_maybe': ['Yes', 'No', 'Maybe', 'Yes'],
    }
)

# errors='coerce' replaces unparseable entries with a missing value (NaN / NaT)
# instead of raising, so both conversions can run before anything is reported
safe_numbers = pd.to_numeric(problematic_data['mixed_numbers'], errors='coerce')
safe_dates = pd.to_datetime(problematic_data['dates_with_errors'], errors='coerce')

print("📊 Problematic Data:", problematic_data, sep="\n")
print()
print("✅ Safe Conversion Approach:")
print()

# Numeric results, with a count of the entries that could not be parsed
print("Safe numeric conversion:", safe_numbers, sep="\n")
print(f"Failed conversions: {safe_numbers.isnull().sum()}")
print()

# Date results, with a count of the entries that could not be parsed
print("Safe date conversion:", safe_dates, sep="\n")
print(f"Failed conversions: {safe_dates.isnull().sum()}")
print()
print("🎯 Always validate your conversion results!")
🎯 Key Takeaways
🎮 Practice Challenge
Apply type conversion to a realistic business dataset:
import pandas as pd
# Customer analytics data export (typical format from many systems):
# every column arrives as text
analytics_export = pd.DataFrame({
    'user_id': ['10001', '10002', '10003', '10004', '10005'],
    'signup_date': ['2023-01-15', '2023-02-10', '2023-01-25', '2023-03-05', '2023-02-20'],
    'subscription_type': ['Premium', 'Basic', 'Premium', 'Basic', 'Premium'],
    'monthly_revenue': ['29.99', '9.99', '29.99', '9.99', '29.99'],
    'is_active': ['1', '1', '0', '1', '1'],
    'last_login_days_ago': ['3', '15', '45', '7', '1'],
    'total_sessions': ['125', '45', '8', '67', '189'],
    'conversion_rate': ['15.5%', '8.2%', '3.1%', '12.8%', '18.9%'],
})
print("🎮 Analytics Export Challenge:")
print(analytics_export)
print()
print("Mission: Convert all columns to appropriate types for analysis")
print()

# Solution
print("🔄 Conversion Solution:")
cleaned_analytics = analytics_export.copy()  # keep the raw export untouched
print("Converting each column...")
# User ID to integer
cleaned_analytics['user_id'] = pd.to_numeric(cleaned_analytics['user_id'])
# Signup date to datetime
cleaned_analytics['signup_date'] = pd.to_datetime(cleaned_analytics['signup_date'])
# Subscription type to category
cleaned_analytics['subscription_type'] = cleaned_analytics['subscription_type'].astype('category')
# Revenue to float
cleaned_analytics['monthly_revenue'] = pd.to_numeric(cleaned_analytics['monthly_revenue'])
# Active status to boolean. astype('bool') would be wrong here: every
# non-empty string, including '0', is truthy, so all users would look active.
cleaned_analytics['is_active'] = cleaned_analytics['is_active'] == '1'
# Login days to integer
cleaned_analytics['last_login_days_ago'] = pd.to_numeric(cleaned_analytics['last_login_days_ago'])
# Sessions to integer
cleaned_analytics['total_sessions'] = pd.to_numeric(cleaned_analytics['total_sessions'])
# Conversion rate: strip the literal % sign, then convert
cleaned_analytics['conversion_rate'] = cleaned_analytics['conversion_rate'].str.replace('%', '', regex=False)
cleaned_analytics['conversion_rate'] = pd.to_numeric(cleaned_analytics['conversion_rate'])
print("✅ Conversion complete!")
print("New data types:")
print(cleaned_analytics.dtypes)
print()

print("🎯 Now we can do real analysis:")
print(f"Active users: {cleaned_analytics['is_active'].sum()}/{len(cleaned_analytics)}")
print(f"Average revenue: ${cleaned_analytics['monthly_revenue'].mean():.2f}")
print(f"Average conversion rate: {cleaned_analytics['conversion_rate'].mean():.1f}%")
print(f"Days since signup (average): {(pd.Timestamp.now() - cleaned_analytics['signup_date']).dt.days.mean():.0f} days")
🚀 What's Next?
Fantastic! You now know how to convert data to the right types for accurate analysis. Let's complete our data cleaning journey by learning how to clean and standardize text data.
Continue to: String Cleaning Operations
You're becoming a data type expert! 🔄✨
Was this helpful?
Track Your Learning Progress
Sign in to bookmark tutorials and keep track of your learning journey.
Your progress is saved automatically as you read.