🔍 Working with Regular Expressions

Regular expressions (regex) are powerful patterns for finding, extracting, and manipulating text. They're essential for data validation, text processing, and extracting information from unstructured data.

import re

# Basic pattern matching
text = "My email is john@example.com and phone is (555) 123-4567"

# Find email addresses.
# The top-level-domain class is [A-Za-z]. Writing it as [A-Z|a-z] would also
# accept a literal "|", because "|" is not alternation inside a character class.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)

# Find phone numbers written as "(555) 123-4567"
phone_pattern = r'\(\d{3}\) \d{3}-\d{4}'
phones = re.findall(phone_pattern, text)

print(f"Found emails: {emails}")
print(f"Found phones: {phones}")

# Check if the whole text matches the pattern.
# re.fullmatch is used rather than re.match: re.match only anchors the start,
# so it would also report a match for "user@domain.com!!!".
is_valid_email = re.fullmatch(email_pattern, "user@domain.com")
print(f"Valid email: {is_valid_email is not None}")

🎯 Understanding Regular Expressions

Regular expressions use special characters to define search patterns that can match various text combinations.

Basic Pattern Matching

import re

text = "The year 2024 was great, better than 2023"

# One compiled pattern for a run of four digits, reused below
year_pattern = re.compile(r'\d{4}')

# findall: every four-digit run in the text, left to right
years = year_pattern.findall(text)
print(f"Years found: {years}")

# match: only succeeds when the pattern matches at the very beginning
starts_with_the = re.compile(r'The').match(text)
print(f"Starts with 'The': {starts_with_the is not None}")

# search: the first four-digit run anywhere in the text (None if absent)
first_year = year_pattern.search(text)
if first_year is not None:
    print(f"First year: {first_year.group()}")
    print(f"Position: {first_year.start()}-{first_year.end()}")

Text Replacement with Regex

import re

# Substituting text that matches a pattern
text = "Call us at 555-123-4567 or 555-987-6543"

# Every phone number is replaced by the same fixed mask
masked = re.compile(r'\d{3}-\d{3}-\d{4}').sub('XXX-XXX-XXXX', text)
print(f"Masked: {masked}")

# Capture groups can be referenced in the replacement (\1, \2, ...)
text2 = "John Smith, Jane Doe, Bob Johnson"
# Each "First Last" pair becomes "Last, First"
swapped = re.compile(r'(\w+) (\w+)').sub(r'\2, \1', text2)
print(f"Swapped: {swapped}")

# Any run of whitespace collapses to a single space
messy_text = "Hello    world   with    spaces"
cleaned = re.compile(r'\s+').sub(' ', messy_text)
print(f"Cleaned: {cleaned}")

📋 Regex Pattern Reference

Pattern	Meaning	Example
`.`	Any character except a newline	`a.c` matches "abc", "a1c"
`\d`	Any digit	`\d{3}` matches "123"
`\w`	Word character (letter, digit, or underscore)	`\w+` matches "hello"
`\s`	Whitespace	`\s+` matches spaces/tabs
`*`	Zero or more	`a*` matches "", "a", "aa"
`+`	One or more	`a+` matches "a", "aa"
`?`	Zero or one	`a?` matches "", "a"
`[]`	Character set	`[abc]` matches "a", "b", "c"
`()`	Group	`(abc)+` matches "abc", "abcabc"

🔧 Common Regex Patterns

Email Validation

import re

def validate_email(email):
    """Return True if *email*, in its entirety, looks like ``local@domain.tld``.

    This is a deliberately simple shape check (letters, digits and a few
    punctuation characters around a single "@", followed by a dot and a
    top-level domain of at least two letters); it is not a full RFC 5322
    validator.

    Args:
        email: The string to check.

    Returns:
        True when the whole string matches the pattern, False otherwise.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch instead of match with ^...$: "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" pass as valid.
    return re.fullmatch(pattern, email) is not None

def extract_emails(text):
    """Return every email-like substring of *text*, in order of appearance.

    Args:
        text: The string to scan.

    Returns:
        A list of matched addresses (empty when none are found).
    """
    # The top-level-domain class is [A-Za-z]; inside a character class "|" is
    # a literal, so [A-Z|a-z] would wrongly allow "|" as part of the domain.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)

# Run the validator over a few sample addresses
emails_to_test = ["user@example.com", "invalid.email", "test@domain.co.uk", "bad@email"]

# Pair each address with its validation result and report it
for email, valid in zip(emails_to_test, map(validate_email, emails_to_test)):
    print(f"{email}: {'Valid' if valid else 'Invalid'}")

# Pull every address out of a sentence
text = "Contact us at support@company.com or sales@company.org"
found_emails = extract_emails(text)
print(f"Found emails: {found_emails}")

Phone Number Processing

import re

def format_phone(phone):
    """Normalize a US phone number to the form ``(AAA) EEE-NNNN``.

    All non-digit characters are discarded first. Ten digits are formatted
    directly; eleven digits are accepted only when the first is the country
    code "1", which is dropped.

    Args:
        phone: The phone number as a string, in any punctuation style.

    Returns:
        The formatted number, or the string "Invalid phone number" when the
        digit count does not fit either accepted shape.
    """
    national = re.sub(r'\D', '', phone)

    # Strip a leading US country code so both accepted shapes become 10 digits
    if len(national) == 11 and national.startswith('1'):
        national = national[1:]

    if len(national) != 10:
        return "Invalid phone number"

    area, exchange, subscriber = national[:3], national[3:6], national[6:]
    return f"({area}) {exchange}-{subscriber}"

# All supported phone formats combined into one alternation so that a single
# left-to-right scan yields the matches in the order they occur in the text.
# The digit lookarounds stop a match from being a fragment of a longer run of
# digits (e.g. the first ten digits of a twelve-digit order number).
_PHONE_NUMBER_RE = re.compile(
    r'\(\d{3}\) \d{3}-\d{4}(?!\d)'     # (555) 123-4567
    r'|(?<!\d)(?:'
    r'\d{3}-\d{3}-\d{4}'               # 555-123-4567
    r'|\d{3}\.\d{3}\.\d{4}'            # 555.123.4567
    r'|\d{10}'                         # 5551234567
    r')(?!\d)'
)


def extract_phone_numbers(text):
    """Return the phone numbers found in *text*, in order of appearance.

    Recognized formats are ``(555) 123-4567``, ``555-123-4567``,
    ``555.123.4567`` and ``5551234567``.

    Args:
        text: The string to scan.

    Returns:
        A list of the matched substrings exactly as they appear in *text*
        (empty when none are found).
    """
    return _PHONE_NUMBER_RE.findall(text)

# The same number written four different ways
phone_numbers = ["5551234567", "1-555-123-4567", "(555) 123-4567", "555.123.4567"]

# Show each input next to its normalized form
for phone, formatted in zip(phone_numbers, map(format_phone, phone_numbers)):
    print(f"{phone} → {formatted}")

# Pull phone numbers out of a sentence
text = "Call (555) 123-4567 or 555.987.6543 for support"
found_phones = extract_phone_numbers(text)
print(f"Found phones: {found_phones}")

URL and File Path Processing

import re

def extract_urls(text):
    """Return the http/https URLs found in *text*, in order of appearance.

    Args:
        text: The string to scan.

    Returns:
        A list of URL strings (empty when none are found). Sentence
        punctuation that directly follows a URL is not included.
    """
    # Simple URL pattern: the scheme, then any run of characters that are not
    # whitespace or one of the delimiters commonly placed around URLs.
    pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
    # The character class above also accepts ".", ",", "!" and so on, so a URL
    # at the end of a clause ("see http://test.org.") would swallow the
    # punctuation after it. Trim that off each match.
    return [url.rstrip('.,;:!?') for url in re.findall(pattern, text)]

def parse_filename(filepath):
    """Split a "/"-separated file path into directory, base name and extension.

    Args:
        filepath: The path string to split.

    Returns:
        A dict with keys 'path' (directory part including the trailing "/",
        or '' when absent), 'name' (file name without the last extension) and
        'extension' (last ".ext" including the dot, or '' when absent).
        Returns None when the pattern does not match, e.g. for an empty
        string or a path ending in "/".
    """
    found = re.match(r'(.*/)?([^/]+?)(\.[^.]*)?$', filepath)
    if found is None:
        return None

    # Optional groups that did not take part in the match come back as None
    directory, stem, suffix = found.groups()
    return {
        'path': directory or '',
        'name': stem,
        'extension': suffix or '',
    }

def clean_filename(filename):
    """Return *filename* with characters that are invalid in file names removed.

    Each of ``< > : " / \\ | ? *`` becomes an underscore, runs of two or more
    underscores are collapsed to one, and underscores at either end of the
    result are stripped.

    Args:
        filename: The raw file name.

    Returns:
        The sanitized file name (possibly empty).
    """
    # Applied in order: first substitute the invalid characters, then collapse
    # the underscore runs that the substitution may have produced.
    rewrites = (
        (r'[<>:"/\\|?*]', '_'),
        (r'_{2,}', '_'),
    )
    result = filename
    for pattern, replacement in rewrites:
        result = re.sub(pattern, replacement, result)
    return result.strip('_')

# Pull the URLs out of a sentence
text = "Visit https://example.com or http://test.org for more info"
urls = extract_urls(text)
print(f"Found URLs: {urls}")

# Split a few sample paths into directory, name and extension
filepaths = ["/home/user/document.pdf", "image.jpg", "/path/to/file", "archive.tar.gz"]

for filepath, parsed in zip(filepaths, map(parse_filename, filepaths)):
    print(f"{filepath} → {parsed}")

# Sanitize names that contain characters not allowed in file names
dirty_names = ["My Document <version 2>.pdf", "file/with\\bad:chars.txt", "normal_filename.doc"]

for name, clean in zip(dirty_names, map(clean_filename, dirty_names)):
    print(f"'{name}' → '{clean}'")

Log File Processing

import re
from datetime import datetime

def parse_log_entry(log_line):
    """Parse one log line of the form ``YYYY-MM-DD HH:MM:SS [LEVEL] message``.

    Args:
        log_line: A single line of log text.

    Returns:
        A dict with the string values 'timestamp', 'level' and 'message', or
        None when the line does not start with that layout.
    """
    found = re.match(
        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)',
        log_line,
    )
    if not found:
        return None

    # The three capture groups line up with the three result keys, in order
    return dict(zip(('timestamp', 'level', 'message'), found.groups()))

def extract_ip_addresses(text):
    """Return the IPv4 addresses found in *text*, in order of appearance.

    Args:
        text: The string to scan.

    Returns:
        A list of dotted-quad strings whose four octets are each in the
        range 0-255 (empty when none are found).
    """
    # IPv4 shape: four groups of one to three digits separated by dots
    pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    # The shape alone also admits impossible addresses such as 999.1.1.1,
    # so keep only candidates whose octets all fit in a byte.
    return [
        candidate
        for candidate in re.findall(pattern, text)
        if all(int(octet) <= 255 for octet in candidate.split('.'))
    ]

def find_error_codes(text):
    """Return every standalone three-digit number in the 400-599 range.

    Args:
        text: The string to scan.

    Returns:
        A list of the matched numbers as strings, in order of appearance
        (empty when none are found).
    """
    # A 4 or 5 followed by two digits, delimited by word boundaries
    hits = re.finditer(r'\b[4-5]\d{2}\b', text)
    return [hit.group() for hit in hits]

# Three sample lines in "timestamp [LEVEL] message" layout
log_entries = [
    "2024-01-15 10:30:45 [INFO] User 192.168.1.100 logged in",
    "2024-01-15 10:31:12 [ERROR] 404 error for /missing-page",
    "2024-01-15 10:31:45 [WARN] High CPU usage detected",
]

print("Parsed log entries:")
for entry in log_entries:
    parsed = parse_log_entry(entry)
    if not parsed:
        # Lines that do not follow the layout are skipped
        continue
    print(f"  {parsed['level']}: {parsed['message']}")

# Scan all lines together, joined by single spaces
all_logs = " ".join(log_entries)
ip_addresses, error_codes = extract_ip_addresses(all_logs), find_error_codes(all_logs)

print(f"\nIP addresses found: {ip_addresses}")
print(f"Error codes found: {error_codes}")

📊 Regex Flags Reference

Flag	Purpose	Example
`re.IGNORECASE`	Case-insensitive	`re.findall(r'hello', text, re.IGNORECASE)`
`re.MULTILINE`	`^` and `$` match line starts/ends	`re.findall(r'^ERROR', text, re.MULTILINE)`
`re.DOTALL`	`.` matches newlines too	`re.findall(r'start.*end', text, re.DOTALL)`

Using Regex Flags

import re

text = """INFO: Application started
ERROR: Database connection failed
WARNING: Low memory
ERROR: User authentication failed"""

# IGNORECASE: the lowercase pattern also matches "ERROR"
errors_any_case = re.findall(r'error', text, flags=re.IGNORECASE)
print(f"Errors (any case): {errors_any_case}")

# MULTILINE: "^" anchors at the start of every line, not only the first
error_lines = re.findall(r'^ERROR:.*', text, flags=re.MULTILINE)
print(f"Error lines: {error_lines}")

# Tally the word before the colon at the start of each line
levels = re.findall(r'^(\w+):', text, flags=re.MULTILINE)
level_counts = {}
for level in levels:
    if level in level_counts:
        level_counts[level] += 1
    else:
        level_counts[level] = 1

print(f"Log level counts: {level_counts}")

🎯 Key Takeaways

🚀 What's Next?

Learn how to parse structured markup languages like XML and HTML to extract data from web content.

Continue to: Parse XML and HTML

Online Python

🔍 Working with Regular Expressions

Track Your Learning Progress