🌐 Parsing XML and HTML
XML and HTML are structured markup languages used for data storage and web content. Python provides powerful tools to parse these formats and extract the information you need.
import xml.etree.ElementTree as ET
# Sample XML data: a <books> root holding two <book> records.
xml_data = """
<books>
<book id="1">
<title>Python Programming</title>
<author>John Doe</author>
<price>29.99</price>
</book>
<book id="2">
<title>Web Development</title>
<author>Jane Smith</author>
<price>34.99</price>
</book>
</books>
"""

# Parse the XML text; `root` is the top-level <books> element.
root = ET.fromstring(xml_data)

# Extract book information from every direct <book> child of the root.
for book in root.findall('book'):
    # get() reads an attribute and yields None when it is absent.
    book_id = book.get('id')
    # findtext() yields None for a missing child instead of raising
    # AttributeError the way find(...).text would.
    title = book.findtext('title')
    author = book.findtext('author')
    price = book.findtext('price')
    print(f"Book {book_id}: {title} by {author} - ${price}")
🎯 Understanding XML and HTML Parsing
XML and HTML have tree-like structures that can be navigated to extract specific data elements.
Basic XML Parsing
import xml.etree.ElementTree as ET
# Inline configuration document used by this example.
config_xml = """
<config>
<database>
<host>localhost</host>
<port>5432</port>
<name>myapp</name>
</database>
<settings>
<debug>true</debug>
<timeout>30</timeout>
</settings>
</config>
"""

# Build the element tree; `root` is the <config> element.
root = ET.fromstring(config_xml)

# Database settings, looked up relative to the <database> section.
database_node = root.find('database')
db_host = database_node.find('host').text
db_port = int(database_node.find('port').text)
db_name = database_node.find('name').text

# Application settings, looked up relative to the <settings> section.
settings_node = root.find('settings')
# Only the exact lowercase text 'true' counts as enabled.
debug_mode = 'true' == settings_node.find('debug').text
timeout = int(settings_node.find('timeout').text)

print(f"Database: {db_host}:{db_port}/{db_name}")
print(f"Debug: {debug_mode}, Timeout: {timeout}s")
Working with XML Attributes
import xml.etree.ElementTree as ET
# XML with attributes: each <product> carries id/category attributes and
# its <price> child carries a currency attribute.
products_xml = """
<catalog>
<product id="101" category="electronics">
<name>Laptop</name>
<price currency="USD">999.99</price>
<stock>5</stock>
</product>
<product id="102" category="books">
<name>Python Guide</name>
<price currency="USD">39.99</price>
<stock>20</stock>
</product>
</catalog>
"""

root = ET.fromstring(products_xml)

# Process every direct <product> child of the <catalog> root.
for product in root.findall('product'):
    # Attributes live on the element itself and are read with get().
    product_id = product.get('id')
    category = product.get('category')

    # Child element text is read through find(...).text.
    name = product.find('name').text
    # Keep the <price> element around: both its text and its
    # currency attribute are needed.
    price_element = product.find('price')
    price = float(price_element.text)
    currency = price_element.get('currency')
    stock = int(product.find('stock').text)

    print(f"Product {product_id} ({category}):")
    print(f" {name}: {price} {currency} (Stock: {stock})")
📋 XML Parsing Methods Reference
Method | Purpose | Example |
---|---|---|
ET.fromstring() | Parse XML string | root = ET.fromstring(xml_text) |
ET.parse() | Parse XML file | tree = ET.parse('file.xml') |
find() | Find first matching element | element.find('tag') |
findall() | Find all matching elements | element.findall('tag') |
get() | Get attribute value | element.get('attribute') |
.text | Get element text content | element.text |
🔧 HTML Parsing with Built-in Tools
Simple HTML Parser
from html.parser import HTMLParser
class SimpleHTMLParser(HTMLParser):
    """Collect link targets and the text of title/heading/paragraph elements.

    Text is buffered for the whole lifetime of a content element, so text
    that sits inside or after an inline child (for example the link text
    and trailing "." in ``<p>See <a href="/x">this</a>.</p>``) is kept
    rather than dropped.

    Attributes:
        links: ``href`` values of every ``<a>`` start tag, in document order.
        current_tag: Name of the most recently opened tag; reset to ``None``
            whenever any end tag is seen.
        content: One whitespace-normalised, non-empty string per completed
            ``title``/``h1``/``h2``/``p`` element.
    """

    # Elements whose text is collected into ``content``.
    CONTENT_TAGS = ('title', 'h1', 'h2', 'p')

    def __init__(self):
        super().__init__()
        self.links = []
        self.current_tag = None
        self.content = []
        # Content element currently being collected, and its text pieces.
        self._content_tag = None
        self._buffer = []

    def _flush_content(self):
        """Emit the buffered text of the open content element, if any."""
        if self._content_tag is None:
            return
        # Join the raw pieces first so inline markup does not split words,
        # then collapse every whitespace run to a single space.
        text = ' '.join(''.join(self._buffer).split())
        if text:
            self.content.append(text)
        self._content_tag = None
        self._buffer = []

    def handle_starttag(self, tag, attrs):
        """Record ``href`` of links and start buffering for content tags."""
        self.current_tag = tag
        if tag == 'a':
            # Extract href attribute from links
            for attr_name, attr_value in attrs:
                if attr_name == 'href':
                    self.links.append(attr_value)
        if tag in self.CONTENT_TAGS:
            # A new content element implicitly ends one left unclosed
            # (e.g. ``<p>one<p>two``).
            self._flush_content()
            self._content_tag = tag

    def handle_data(self, data):
        """Buffer text while a content element is open."""
        if self._content_tag is not None:
            self._buffer.append(data)

    def handle_endtag(self, tag):
        """Emit the buffered text when the open content element closes."""
        self.current_tag = None
        # End tags of nested inline elements must not end the collection.
        if tag == self._content_tag:
            self._flush_content()

    def close(self):
        """Finish parsing and emit text of a content element left unclosed."""
        super().close()
        self._flush_content()
# Sample HTML: a title, a heading and two paragraphs that each contain a link.
html_content = """
<html>
<head>
<title>Sample Page</title>
</head>
<body>
<h1>Welcome</h1>
<p>This is a paragraph with <a href="https://example.com">a link</a>.</p>
<p>Another paragraph with <a href="/page2">internal link</a>.</p>
</body>
</html>
"""

# Parse HTML: feed() drives the handler callbacks that fill the
# parser's `links` and `content` lists.
parser = SimpleHTMLParser()
parser.feed(html_content)

print("Links found:")
for link in parser.links:
    print(f" {link}")

print("\nContent found:")
for content in parser.content:
    if content:  # Skip empty strings
        print(f" {content}")
HTML Data Extraction
import re
def extract_html_data(html):
    """Simple HTML data extraction using regex (for basic cases).

    Args:
        html: HTML markup as a string.

    Returns:
        A dict with three keys:
            'title': text of the first ``<title>`` element with whitespace
                runs collapsed, or "No title" when there is none.
            'links': ``href`` values of all ``<a>`` tags, in document order.
            'text': the markup with tags removed and whitespace collapsed,
                cut to 200 characters plus '...' when longer.
    """
    # Extract title. DOTALL lets the title span several lines and
    # [^>]* tolerates attributes on the <title> tag.
    title_match = re.search(
        r'<title\b[^>]*>(.*?)</title>', html, re.IGNORECASE | re.DOTALL
    )
    if title_match:
        title = ' '.join(title_match.group(1).split())
    else:
        title = "No title"

    # Extract all links.
    #  - `<a\s` requires whitespace right after the tag name, so other tags
    #    that merely start with "a" (<abbr>, <area>, ...) are not matched.
    #  - `(?:[^>]*?\s)?` skips earlier attributes and makes sure `href` is a
    #    whole attribute name.
    #  - the backreference \1 closes the value with the same quote character
    #    that opened it, so the other quote may appear inside the value.
    link_pattern = r'<a\s(?:[^>]*?\s)?href\s*=\s*([\'"])(.*?)\1'
    links = [
        match.group(2)
        for match in re.finditer(link_pattern, html, re.IGNORECASE)
    ]

    # Extract text content (remove HTML tags)
    text_content = re.sub(r'<[^>]+>', '', html)
    # Clean up whitespace
    text_content = re.sub(r'\s+', ' ', text_content).strip()

    # Preview only: keep the first 200 characters of long documents.
    if len(text_content) > 200:
        text = text_content[:200] + '...'
    else:
        text = text_content

    return {
        'title': title,
        'links': links,
        'text': text,
    }
# Demo page: a title, a heading, one paragraph and two links.
sample_html = """
<html>
<head><title>Product Page</title></head>
<body>
<h1>Amazing Product</h1>
<p>This product is <strong>fantastic</strong>!</p>
<a href="/buy-now">Buy Now</a>
<a href="mailto:support@example.com">Contact Support</a>
</body>
</html>
"""

# Run the extraction once, then report each field on its own line.
extracted_data = extract_html_data(sample_html)
print(
    f"Title: {extracted_data['title']}",
    f"Links: {extracted_data['links']}",
    f"Text: {extracted_data['text']}",
    sep="\n",
)
🏗️ Creating XML Documents
Building XML with ElementTree
import xml.etree.ElementTree as ET
def create_xml_report(data, generated="2024-01-15"):
    """Build a ``<report>`` element tree from a list of item records.

    Args:
        data: Sequence of dicts, each with the keys 'id', 'name', 'price'
            and 'category'. 'name' and 'category' are stored as given;
            'id' and 'price' are converted with ``str()``.
        generated: Value for the root element's ``generated`` attribute.
            Defaults to the date the original example used.

    Returns:
        The root ``<report>`` element. It holds a ``<summary>`` (with
        ``<total_items>`` and ``<total_value>``) followed by ``<items>``
        containing one ``<item id="...">`` per record.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # Create root element
    root = ET.Element("report")
    root.set("generated", generated)

    # Add summary section
    summary = ET.SubElement(root, "summary")
    ET.SubElement(summary, "total_items").text = str(len(data))
    # sum() of an empty sequence is 0, so an empty report gets "0".
    total_value = sum(item['price'] for item in data)
    ET.SubElement(summary, "total_value").text = str(total_value)

    # Add items section
    items = ET.SubElement(root, "items")
    for item_data in data:
        item = ET.SubElement(items, "item")
        # Attribute values must be strings for serialisation.
        item.set("id", str(item_data['id']))
        ET.SubElement(item, "name").text = item_data['name']
        ET.SubElement(item, "price").text = str(item_data['price'])
        ET.SubElement(item, "category").text = item_data['category']

    return root
# Demo records, written as rows and expanded into dicts keyed by field name.
_REPORT_FIELDS = ('id', 'name', 'price', 'category')
sample_data = [
    dict(zip(_REPORT_FIELDS, row))
    for row in (
        (1, 'Laptop', 999.99, 'Electronics'),
        (2, 'Book', 29.99, 'Education'),
        (3, 'Coffee Mug', 12.99, 'Kitchen'),
    )
]

# Build the report tree from the demo records.
xml_root = create_xml_report(sample_data)

# encoding='unicode' makes tostring() return str rather than bytes.
xml_string = ET.tostring(xml_root, encoding='unicode')
print("Generated XML:", xml_string, sep="\n")
🎯 Key Takeaways
🚀 What's Next?
Learn how to fetch and process data from web APIs to integrate external services into your applications.
Continue to: Handle API Requests
Was this helpful?
Track Your Learning Progress
Sign in to bookmark tutorials and keep track of your learning journey.
Your progress is saved automatically as you read.