🔍 Working with Regular Expressions
Regular expressions (regex) are powerful patterns for finding, extracting, and manipulating text. They're essential for data validation, text processing, and extracting information from unstructured data.
import re
# Basic pattern matching
text = "My email is john@example.com and phone is (555) 123-4567"

# Find email addresses.
# The top-level domain class is [A-Za-z]: inside a character class "|" is a
# literal pipe, not alternation, so "[A-Z|a-z]" would also accept "|".
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
emails = re.findall(email_pattern, text)

# Find phone numbers written as "(555) 123-4567"
phone_pattern = r'\(\d{3}\) \d{3}-\d{4}'
phones = re.findall(phone_pattern, text)

print(f"Found emails: {emails}")
print(f"Found phones: {phones}")

# Check if the whole string matches the pattern.
# re.match() only anchors at the start, so trailing junk would still pass;
# re.fullmatch() requires the entire string to match.
is_valid_email = re.fullmatch(email_pattern, "user@domain.com")
print(f"Valid email: {is_valid_email is not None}")
🎯 Understanding Regular Expressions
Regular expressions use special characters to define search patterns that can match various text combinations.
Basic Pattern Matching
import re
text = "The year 2024 was great, better than 2023"

# Find all 4-digit numbers (years)
years = re.findall(r'\d{4}', text)
print(f"Years found: {years}")

# Check if text starts with "The" (re.match only tries position 0)
starts_with_the = re.match(r'The', text)
print(f"Starts with 'The': {starts_with_the is not None}")

# Find first occurrence of a year (re.search scans the whole string)
first_year = re.search(r'\d{4}', text)
if first_year:
    # A match object exposes the matched text and its position in `text`.
    print(f"First year: {first_year.group()}")
    print(f"Position: {first_year.start()}-{first_year.end()}")
Text Replacement with Regex
import re
# Replace patterns in text
text = "Call us at 555-123-4567 or 555-987-6543"

# Replace phone numbers with masked version
masked = re.sub(r'\d{3}-\d{3}-\d{4}', 'XXX-XXX-XXXX', text)
print(f"Masked: {masked}")

# Replace with capture groups
text2 = "John Smith, Jane Doe, Bob Johnson"

# Swap first and last names: \1 and \2 refer to the first and second group
swapped = re.sub(r'(\w+) (\w+)', r'\2, \1', text2)
print(f"Swapped: {swapped}")

# Remove extra whitespace: every run of whitespace becomes a single space.
# The sample needs runs of several spaces for the substitution to be visible.
messy_text = "Hello    world   with     spaces"
cleaned = re.sub(r'\s+', ' ', messy_text)
print(f"Cleaned: {cleaned}")
📋 Regex Pattern Reference
| Pattern | Meaning | Example |
|---|---|---|
| `.` | Any character except a newline | `a.c` matches "abc", "a1c" |
| `\d` | Any digit | `\d{3}` matches "123" |
| `\w` | Word character (letter, digit, or underscore) | `\w+` matches "hello" |
| `\s` | Whitespace | `\s+` matches spaces/tabs |
| `*` | Zero or more | `a*` matches "", "a", "aa" |
| `+` | One or more | `a+` matches "a", "aa" |
| `?` | Zero or one | `a?` matches "", "a" |
| `[]` | Character set | `[abc]` matches "a", "b", "c" |
| `()` | Group | `(abc)+` matches "abc", "abcabc" |
🔧 Common Regex Patterns
Email Validation
import re
def validate_email(email):
    """Return True if *email* has the shape ``local@domain.tld``.

    This is a simple syntactic check, not a full RFC 5322 validator: the
    local part and domain may contain letters, digits and a few punctuation
    characters, and the top-level domain must be at least two letters.

    Args:
        email: The string to check.

    Returns:
        True when the entire string matches the pattern, False otherwise.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch() instead of match() with ^...$: "$" also matches just before
    # a trailing newline, which would let "user@example.com\n" through.
    return re.fullmatch(pattern, email) is not None
def extract_emails(text):
    """Return every email-like substring found in *text*.

    Args:
        text: Free-form text to scan.

    Returns:
        A list of matched addresses in order of appearance (empty if none).
    """
    # The top-level domain class is [A-Za-z]: inside a character class "|" is
    # a literal pipe, so "[A-Z|a-z]" would wrongly accept "|" as a TLD letter.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)
# Test email validation
emails_to_test = [
    "user@example.com",
    "invalid.email",
    "test@domain.co.uk",
    "bad@email",
]

for email in emails_to_test:
    valid = validate_email(email)
    print(f"{email}: {'Valid' if valid else 'Invalid'}")

# Extract emails from text
text = "Contact us at support@company.com or sales@company.org"
found_emails = extract_emails(text)
print(f"Found emails: {found_emails}")
Phone Number Processing
import re
def format_phone(phone):
    """Format a US phone number as ``(XXX) XXX-XXXX``.

    Any punctuation or spacing in the input is ignored; only the digits count.
    A leading country code ``1`` in front of ten digits is accepted and dropped.

    Args:
        phone: The phone number in any common written form.

    Returns:
        The formatted number, or the string ``"Invalid phone number"`` when
        the input does not contain exactly ten digits (or eleven starting
        with ``1``).
    """
    # Remove all non-digits. re.ASCII restricts \D/\d to 0-9; without it,
    # digits from other scripts would be kept and counted as valid.
    digits = re.sub(r'\D', '', phone, flags=re.ASCII)

    # Drop the US country code so both accepted shapes share one format path.
    if len(digits) == 11 and digits[0] == '1':
        digits = digits[1:]

    if len(digits) != 10:
        return "Invalid phone number"
    return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
def extract_phone_numbers(text):
    """Return the phone numbers found in *text*, in order of appearance.

    Recognized formats: ``(555) 123-4567``, ``555-123-4567``,
    ``555.123.4567`` and ``5551234567``.

    Args:
        text: Free-form text to scan.

    Returns:
        A list of the matched substrings exactly as written (empty if none).
    """
    formats = (
        r'\(\d{3}\) \d{3}-\d{4}',  # (555) 123-4567
        r'\d{3}-\d{3}-\d{4}',      # 555-123-4567
        r'\d{3}\.\d{3}\.\d{4}',    # 555.123.4567
        r'\d{10}',                 # 5551234567
    )
    # One combined alternation keeps results in text order (scanning once per
    # format would group them by format instead). The lookarounds stop a
    # match from being cut out of the middle of a longer run of digits.
    pattern = r'(?<!\d)(?:' + '|'.join(formats) + r')(?!\d)'
    return re.findall(pattern, text)
# Test phone formatting
phone_numbers = [
    "5551234567",
    "1-555-123-4567",
    "(555) 123-4567",
    "555.123.4567",
]

for phone in phone_numbers:
    formatted = format_phone(phone)
    print(f"{phone} → {formatted}")

# Extract from text
text = "Call (555) 123-4567 or 555.987.6543 for support"
found_phones = extract_phone_numbers(text)
print(f"Found phones: {found_phones}")
URL and File Path Processing
import re
def extract_urls(text):
    """Return the http/https URLs found in *text*, in order of appearance.

    This is a deliberately simple pattern: a URL is ``http://`` or
    ``https://`` followed by characters that are not whitespace or one of
    ``< > " { } | \\ ^ ` [ ]``.

    Args:
        text: Free-form text to scan.

    Returns:
        A list of URL strings (empty if none).
    """
    # Simple URL pattern
    pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
    # Punctuation that ends a sentence or clause ("... see http://test.org.")
    # is matched by the character class above, so trim it from the end.
    return [url.rstrip('.,;:!?') for url in re.findall(pattern, text)]
def parse_filename(filepath):
    """Split a ``/``-separated path into directory, base name and extension.

    Only the last dot starts the extension, so ``archive.tar.gz`` yields
    the name ``archive.tar`` and the extension ``.gz``.

    Args:
        filepath: A path using ``/`` as the separator.

    Returns:
        A dict with keys ``'path'`` (directory part including the trailing
        slash, or ``''``), ``'name'`` and ``'extension'`` (including the
        leading dot, or ``''``). Returns None when there is no file name to
        extract, e.g. for an empty string or a path ending in ``/``.
    """
    # Group 1: optional directory part, greedy up to the last "/".
    # Group 2: the name, lazy so the optional group 3 can claim the extension.
    # Group 3: optional extension, starting at the last "." in the name.
    pattern = r'(.*/)?([^/]+?)(\.[^.]*)?$'
    match = re.match(pattern, filepath)
    if match is None:
        return None

    directory, name, extension = match.groups()
    return {
        # Optional groups that did not take part in the match are None.
        'path': directory or '',
        'name': name,
        'extension': extension or '',
    }
def clean_filename(filename):
    """Replace characters that are not allowed in file names with ``_``.

    The characters ``< > : " / \\ | ? *`` are each replaced by an underscore,
    runs of underscores are collapsed to one, and leading/trailing
    underscores are removed.

    Args:
        filename: The raw file name.

    Returns:
        The cleaned name. This can be an empty string when the input
        consists only of invalid characters and underscores.
    """
    # Remove invalid characters for filenames
    replaced = re.sub(r'[<>:"/\\|?*]', '_', filename)
    # Remove multiple underscores (this also collapses runs of underscores
    # that were already present in the input)
    collapsed = re.sub(r'_{2,}', '_', replaced)
    return collapsed.strip('_')
# Test URL extraction
text = "Visit https://example.com or http://test.org for more info"
urls = extract_urls(text)
print(f"Found URLs: {urls}")

# Test filename parsing
filepaths = [
    "/home/user/document.pdf",
    "image.jpg",
    "/path/to/file",
    "archive.tar.gz",
]

for filepath in filepaths:
    parsed = parse_filename(filepath)
    print(f"{filepath} → {parsed}")

# Test filename cleaning
dirty_names = [
    "My Document <version 2>.pdf",
    "file/with\\bad:chars.txt",
    "normal_filename.doc",
]

for name in dirty_names:
    clean = clean_filename(name)
    print(f"'{name}' → '{clean}'")
Log File Processing
import re
from datetime import datetime
def parse_log_entry(log_line):
    """Parse one log line of the form ``YYYY-MM-DD HH:MM:SS [LEVEL] message``.

    Args:
        log_line: A single line of log text. The timestamp must start at the
            first character of the line.

    Returns:
        A dict with the string values ``'timestamp'``, ``'level'`` and
        ``'message'``, or None when the line does not have that shape.
    """
    # Parse common log format: timestamp [LEVEL] message
    pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)'
    match = re.match(pattern, log_line)
    if match is None:
        return None

    # The three capture groups line up with the three result keys.
    return dict(zip(('timestamp', 'level', 'message'), match.groups()))
def extract_ip_addresses(text):
    """Return the IPv4 addresses found in *text*, in order of appearance.

    Args:
        text: Free-form text to scan.

    Returns:
        A list of dotted-quad strings whose four octets are all in the
        range 0-255 (empty if none).
    """
    # IPv4 pattern: four groups of 1-3 digits separated by dots
    pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    candidates = re.findall(pattern, text)
    # The pattern alone also accepts things like "999.1.1.1"; checking the
    # numeric range is simpler and clearer than encoding 0-255 in the regex.
    return [
        candidate
        for candidate in candidates
        if all(int(octet) <= 255 for octet in candidate.split('.'))
    ]
def find_error_codes(text):
    """Return the standalone three-digit numbers from 400 to 599 in *text*.

    These are the ranges used by HTTP client (4xx) and server (5xx) error
    codes. The match is purely textual: any such number counts, whether or
    not it was actually written as a status code.

    Args:
        text: Free-form text to scan.

    Returns:
        A list of the matched numbers as strings, in order of appearance.
    """
    # Common HTTP error codes; \b keeps longer numbers like "4040" out
    pattern = r'\b[4-5]\d{2}\b'
    return re.findall(pattern, text)
# Sample log entries
log_entries = [
    "2024-01-15 10:30:45 [INFO] User 192.168.1.100 logged in",
    "2024-01-15 10:31:12 [ERROR] 404 error for /missing-page",
    "2024-01-15 10:31:45 [WARN] High CPU usage detected",
]

print("Parsed log entries:")
for entry in log_entries:
    parsed = parse_log_entry(entry)
    # Lines that do not match the expected format come back as None.
    if parsed:
        print(f" {parsed['level']}: {parsed['message']}")

# Extract data from logs
all_logs = " ".join(log_entries)
ip_addresses = extract_ip_addresses(all_logs)
error_codes = find_error_codes(all_logs)

print(f"\nIP addresses found: {ip_addresses}")
print(f"Error codes found: {error_codes}")
📊 Regex Flags Reference
| Flag | Purpose | Example |
|---|---|---|
| `re.IGNORECASE` | Case-insensitive matching | `re.findall(r'hello', text, re.IGNORECASE)` |
| `re.MULTILINE` | `^` and `$` match at the start/end of every line | `re.findall(r'^ERROR', text, re.MULTILINE)` |
| `re.DOTALL` | `.` matches newlines too | `re.findall(r'start.*end', text, re.DOTALL)` |
Using Regex Flags
import re
text = """INFO: Application started
ERROR: Database connection failed
WARNING: Low memory
ERROR: User authentication failed"""

# Case-insensitive search
errors_any_case = re.findall(r'error', text, re.IGNORECASE)
print(f"Errors (any case): {errors_any_case}")

# Find lines starting with ERROR (with re.MULTILINE, ^ matches at every line start)
error_lines = re.findall(r'^ERROR:.*', text, re.MULTILINE)
print(f"Error lines: {error_lines}")

# Count different log levels: the capture group makes findall return only
# the level name in front of each colon.
levels = re.findall(r'^(\w+):', text, re.MULTILINE)
level_counts = {}
for level in levels:
    level_counts[level] = level_counts.get(level, 0) + 1
print(f"Log level counts: {level_counts}")
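🎯 Key Takeaways
- `re.findall()` returns every match, `re.search()` finds the first match anywhere in the string, `re.match()` only matches at the start, and `re.sub()` replaces matches.
- Write patterns as raw strings (`r'...'`) so backslashes such as `\d` reach the regex engine unchanged.
- Capture groups `(...)` let you pull out parts of a match or reuse them in a replacement with `\1`, `\2`.
- Flags such as `re.IGNORECASE`, `re.MULTILINE`, and `re.DOTALL` change how a pattern is matched without changing the pattern itself.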
🚀 What's Next?
Learn how to parse structured markup languages like XML and HTML to extract data from web content.
Continue to: Parse XML and HTML
Was this helpful?
Track Your Learning Progress
Sign in to bookmark tutorials and keep track of your learning journey.
Your progress is saved automatically as you read.