🌐 Parsing XML and HTML

XML and HTML are structured markup languages used for data storage and web content. Python provides powerful tools to parse these formats and extract the information you need.

import xml.etree.ElementTree as ET

# Inline XML document describing two books
xml_data = """
<books>
    <book id="1">
        <title>Python Programming</title>
        <author>John Doe</author>
        <price>29.99</price>
    </book>
    <book id="2">
        <title>Web Development</title>
        <author>Jane Smith</author>
        <price>34.99</price>
    </book>
</books>
"""

# Turn the XML text into an element tree; `root` is the <books> element
root = ET.fromstring(xml_data)

# Print one summary line per <book>: its id attribute plus the text of
# the title, author and price child elements
for node in root.findall('book'):
    book_id = node.get('id')
    title, author, price = (
        node.findtext(tag) for tag in ('title', 'author', 'price')
    )

    print(f"Book {book_id}: {title} by {author} - ${price}")

🎯 Understanding XML and HTML Parsing

XML and HTML have tree-like structures that can be navigated to extract specific data elements.

Basic XML Parsing

import xml.etree.ElementTree as ET

# Application configuration expressed as XML
config_xml = """
<config>
    <database>
        <host>localhost</host>
        <port>5432</port>
        <name>myapp</name>
    </database>
    <settings>
        <debug>true</debug>
        <timeout>30</timeout>
    </settings>
</config>
"""

# Build the tree; `root` is the <config> element
root = ET.fromstring(config_xml)

# Database settings: look each value up by its path below the root,
# converting the port to an integer
db_host = root.findtext('database/host')
db_name = root.findtext('database/name')
db_port = int(root.findtext('database/port'))

# Application settings: the timeout is numeric, the debug flag is on only
# when the element text is exactly 'true'
timeout = int(root.findtext('settings/timeout'))
debug_mode = 'true' == root.findtext('settings/debug')

print(f"Database: {db_host}:{db_port}/{db_name}")
print(f"Debug: {debug_mode}, Timeout: {timeout}s")

Working with XML Attributes

import xml.etree.ElementTree as ET

# Product catalog whose elements carry attributes as well as text
products_xml = """
<catalog>
    <product id="101" category="electronics">
        <name>Laptop</name>
        <price currency="USD">999.99</price>
        <stock>5</stock>
    </product>
    <product id="102" category="books">
        <name>Python Guide</name>
        <price currency="USD">39.99</price>
        <stock>20</stock>
    </product>
</catalog>
"""

root = ET.fromstring(products_xml)

# Walk every <product>, combining attribute values with child-element text
for item in root.findall('product'):
    # The <price> element supplies both its text and its currency attribute
    price_element = item.find('price')
    price, currency = float(price_element.text), price_element.get('currency')

    # Attributes on the <product> element itself
    product_id, category = item.get('id'), item.get('category')

    # Text of the remaining child elements
    name = item.findtext('name')
    stock = int(item.findtext('stock'))

    print(f"Product {product_id} ({category}):")
    print(f"  {name}: {price} {currency} (Stock: {stock})")

📋 XML Parsing Methods Reference

| Method / attribute | Purpose | Example |
|---|---|---|
| `ET.fromstring()` | Parse an XML string | `root = ET.fromstring(xml_text)` |
| `ET.parse()` | Parse an XML file | `tree = ET.parse('file.xml')` |
| `find()` | Find the first matching element | `element.find('tag')` |
| `findall()` | Find all matching elements | `element.findall('tag')` |
| `get()` | Get an attribute value | `element.get('attribute')` |
| `.text` | Get an element's text content | `element.text` |

🔧 HTML Parsing with Built-in Tools

Simple HTML Parser

from html.parser import HTMLParser

class SimpleHTMLParser(HTMLParser):
    """Collect link targets and the text of title, heading and paragraph elements.

    The parser keeps a stack of the elements that are currently open, so text
    nested inside inline markup (for example an ``<a>`` within a ``<p>``) and
    text that follows such markup is still attributed to the enclosing
    content element instead of being dropped.

    Attributes:
        links: ``href`` values of every ``<a>`` start tag, in document order.
        content: Non-empty, whitespace-stripped text chunks found anywhere
            inside a ``<title>``, ``<h1>``, ``<h2>`` or ``<p>`` element.
        current_tag: Name of the innermost open element, or ``None`` when no
            element is open.
    """

    # Elements whose text is collected into ``content``.
    CONTENT_TAGS = frozenset({'title', 'h1', 'h2', 'p'})

    # Void elements never get an end tag, so they must not be tracked as open.
    VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        self.links = []
        self.current_tag = None
        self.content = []
        self._open_tags = []  # stack of open elements, innermost last

    def handle_starttag(self, tag, attrs):
        """Record link targets and push non-void elements onto the stack."""
        if tag == 'a':
            # Extract href attribute from links
            self.links.extend(
                attr_value for attr_name, attr_value in attrs
                if attr_name == 'href'
            )
        if tag in self.VOID_TAGS:
            # e.g. <br> inside a paragraph must not end text collection
            return
        self._open_tags.append(tag)
        self.current_tag = tag

    def handle_data(self, data):
        """Store the text when any enclosing element is a content element."""
        if any(tag in self.CONTENT_TAGS for tag in self._open_tags):
            text = data.strip()
            if text:  # whitespace between nested tags carries no content
                self.content.append(text)

    def handle_endtag(self, tag):
        """Close the most recent matching element and restore its parent."""
        if tag in self._open_tags:
            # Also discards children that were left unclosed inside it.
            while self._open_tags.pop() != tag:
                pass
        # A stray end tag with no matching start tag leaves the stack alone.
        self.current_tag = self._open_tags[-1] if self._open_tags else None

# Small HTML page used to exercise the parser
html_content = """
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <h1>Welcome</h1>
    <p>This is a paragraph with <a href="https://example.com">a link</a>.</p>
    <p>Another paragraph with <a href="/page2">internal link</a>.</p>
</body>
</html>
"""

# Feed the whole document to a fresh parser instance
parser = SimpleHTMLParser()
parser.feed(html_content)

# Report every collected link target
print("Links found:")
for link in parser.links:
    print(f"  {link}")

# Report the collected text, leaving out entries that are empty strings
print("\nContent found:")
for content in filter(None, parser.content):
    print(f"  {content}")

HTML Data Extraction

import re

def extract_html_data(html):
    """Extract the title, link targets and visible text from an HTML string.

    This is a regex-based helper intended for simple markup; it is not a
    full HTML parser.

    Args:
        html: HTML source as a string.

    Returns:
        A dict with three keys:
            'title': text of the first ``<title>`` element with entities
                decoded and whitespace collapsed, or "No title" when the
                document has none.
            'links': list of ``href`` values of ``<a>`` tags, in document
                order, with entities decoded.
            'text': the document text with tags removed, entities decoded
                and whitespace collapsed; truncated to 200 characters
                followed by '...' when longer.
    """
    # The ``html`` parameter shadows the stdlib module of the same name,
    # so the helper is imported by name inside the function.
    from html import unescape

    multiline = re.IGNORECASE | re.DOTALL

    # Comments and <script>/<style> bodies are not visible content and may
    # contain tag-like text, so drop them before anything else is matched.
    cleaned = re.sub(r'<!--.*?-->', ' ', html, flags=re.DOTALL)
    cleaned = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>', ' ', cleaned, flags=multiline
    )

    # Extract title: allow attributes on the tag and text spanning lines
    title_match = re.search(
        r'<title\b[^>]*>(.*?)</title\s*>', cleaned, multiline
    )
    if title_match:
        title = ' '.join(unescape(title_match.group(1)).split())
    else:
        title = "No title"

    # Extract all links: ``<a\b`` rejects other tags starting with "a"
    # (such as <area>), and the back-reference makes the value end at the
    # same kind of quote that opened it.
    links = [
        unescape(target)
        for _quote, target in re.findall(
            r'<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1', cleaned, re.IGNORECASE
        )
    ]

    # Extract text content (remove HTML tags), then decode entities so that
    # escaped markup such as ``&lt;b&gt;`` is kept as literal text.
    text_content = unescape(re.sub(r'<[^>]+>', '', cleaned))
    # Clean up whitespace
    text_content = re.sub(r'\s+', ' ', text_content).strip()
    if len(text_content) > 200:
        text_content = text_content[:200] + '...'

    return {
        'title': title,
        'links': links,
        'text': text_content,
    }

# Small product page used to demonstrate the extraction helper
sample_html = """
<html>
<head><title>Product Page</title></head>
<body>
    <h1>Amazing Product</h1>
    <p>This product is <strong>fantastic</strong>!</p>
    <a href="/buy-now">Buy Now</a>
    <a href="mailto:support@example.com">Contact Support</a>
</body>
</html>
"""

# Run the extraction and show each field of the resulting dict
extracted_data = extract_html_data(html=sample_html)
print(f"Title: {extracted_data['title']}")
print(f"Links: {extracted_data['links']}")
print(f"Text: {extracted_data['text']}")

🏗️ Creating XML Documents

Building XML with ElementTree

import xml.etree.ElementTree as ET

def create_xml_report(data, generated="2024-01-15"):
    """Build a ``<report>`` element tree summarising a list of items.

    The tree has a ``<summary>`` section (item count and the sum of all
    prices) followed by an ``<items>`` section with one ``<item>`` element
    per entry.

    Args:
        data: List of dicts, each providing the keys 'id', 'name', 'price'
            and 'category'.
        generated: Value stored in the root element's ``generated``
            attribute. Defaults to the date this function previously
            hard-coded, so existing callers get the same output.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the report.

    Raises:
        KeyError: If an item is missing one of the required keys.
    """
    def as_text(value):
        # ElementTree can only serialise str text; None keeps the element
        # empty, anything else is converted explicitly.
        return None if value is None else str(value)

    # Create root element
    root = ET.Element("report")
    root.set("generated", str(generated))

    # Add summary section
    summary = ET.SubElement(root, "summary")
    ET.SubElement(summary, "total_items").text = str(len(data))
    ET.SubElement(summary, "total_value").text = str(
        sum(item['price'] for item in data)
    )

    # Add items section
    items = ET.SubElement(root, "items")

    for item_data in data:
        item = ET.SubElement(items, "item")
        item.set("id", str(item_data['id']))

        ET.SubElement(item, "name").text = as_text(item_data['name'])
        ET.SubElement(item, "price").text = str(item_data['price'])
        ET.SubElement(item, "category").text = as_text(item_data['category'])

    return root

# Three example items to feed into the report builder
sample_data = [
    dict(id=1, name='Laptop', price=999.99, category='Electronics'),
    dict(id=2, name='Book', price=29.99, category='Education'),
    dict(id=3, name='Coffee Mug', price=12.99, category='Kitchen'),
]

# Build the report tree from the sample items
xml_root = create_xml_report(sample_data)

# Serialise the tree to text (not bytes) and show it under a heading
xml_string = ET.tostring(xml_root, encoding='unicode')
print("Generated XML:", xml_string, sep="\n")

🎯 Key Takeaways

🚀 What's Next?

Learn how to fetch and process data from web APIs to integrate external services into your applications.

Continue to: Handle API Requests

Online Python

🌐 Parsing XML and HTML

Track Your Learning Progress