The Trafilatura tool provides advanced web content extraction, allowing AI agents to pull clean, structured text from web pages while stripping boilerplate such as navigation, ads, and footers.
## Overview
Trafilatura is a Python library and command-line tool designed to extract meaningful content from web pages. It focuses on main text extraction, metadata parsing, and content quality assessment, making it ideal for creating clean datasets from web sources.
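These tools build on the trafilatura library itself. For reference, a minimal sketch of direct library use (independent of the agent tooling shown below) relies on trafilatura's own `fetch_url` and `extract` functions:

```python
import trafilatura

# Fetch a page and extract its main text with the library directly
downloaded = trafilatura.fetch_url("https://example.com/article")
if downloaded:  # fetch_url returns None on failure
    text = trafilatura.extract(downloaded)
    print(text)
```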
## Installation

Install the required dependencies:

```bash
pip install trafilatura lxml
```

For enhanced extraction capabilities, install the optional extras:

```bash
pip install trafilatura[all]
```
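To confirm the install, a quick sanity check (assuming the package's standard `__version__` attribute):

```python
import trafilatura
print(trafilatura.__version__)  # prints the installed version
```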
## Core Functions

### trafilatura_extract

Extracts clean text content from web pages, with options controlling output format, metadata, and content filtering.
```python
from praisonaiagents.tools import trafilatura_extract

# Basic extraction from a URL
content = trafilatura_extract(
    url="https://example.com/article",
    output_format="text"
)

# Extract with metadata
result = trafilatura_extract(
    url="https://example.com/article",
    include_metadata=True,
    output_format="json"
)
print(result['title'])
print(result['author'])
print(result['date'])
print(result['text'])
```
### trafilatura_extract_from_html

Extracts content from a raw HTML string instead of fetching a URL.
```python
from praisonaiagents.tools import trafilatura_extract_from_html

html_content = "<html>...</html>"
extracted = trafilatura_extract_from_html(
    html=html_content,
    include_comments=False,
    include_tables=True
)
```
## Usage Examples
```python
from praisonaiagents import Agent, Task
from praisonaiagents.tools import trafilatura_extract

# Create a content extraction agent
extractor_agent = Agent(
    name="Content Extractor",
    instructions="Extract and analyze web content",
    tools=[trafilatura_extract]
)

# Extract article content
task = Task(
    description="Extract the main content from https://example.com/blog-post",
    agent=extractor_agent
)

result = task.execute()
print(result)
```
### Advanced Extraction Options

```python
from praisonaiagents.tools import trafilatura_extract

# Extract with all options
content = trafilatura_extract(
    url="https://example.com/article",
    output_format="xml",   # "xml", "json", or "text"
    include_metadata=True,
    include_comments=True,
    include_tables=True,
    include_images=True,
    include_links=True,
    deduplicate=True,
    language="en",         # Target language
    min_length=100,        # Minimum text length
    max_length=10000       # Maximum text length
)
```
### Batch Processing Multiple URLs
```python
from praisonaiagents.tools import trafilatura_extract
import concurrent.futures

urls = [
    "https://example.com/article1",
    "https://example.com/article2",
    "https://example.com/article3"
]

def extract_url(url):
    try:
        return {
            'url': url,
            'content': trafilatura_extract(url, include_metadata=True)
        }
    except Exception as e:
        return {'url': url, 'error': str(e)}

# Parallel extraction
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(extract_url, urls))

# Process results
for result in results:
    if 'error' not in result:
        print(f"Extracted from {result['url']}: {len(result['content']['text'])} characters")
```
## Configuration Options
```python
from praisonaiagents.tools import trafilatura_extract

# Full configuration example
extraction_config = {
    'url': 'https://example.com/article',
    'output_format': 'json',                 # 'text', 'json', or 'xml'
    'include_metadata': True,                # Extract title, author, date, etc.
    'include_comments': False,               # Include comment sections
    'include_tables': True,                  # Preserve table structures
    'include_images': True,                  # Extract image information
    'include_links': True,                   # Preserve hyperlinks
    'deduplicate': True,                     # Remove duplicate content
    'language': 'en',                        # Target language for extraction
    'min_length': 25,                        # Minimum paragraph length
    'max_length': 100000,                    # Maximum content length
    'timeout': 30,                           # Request timeout in seconds
    'user_agent': 'Mozilla/5.0 (PraisonAI)', # Custom user agent
}

result = trafilatura_extract(**extraction_config)
```
## Language Detection and Filtering
```python
from praisonaiagents.tools import trafilatura_extract

# Extract only English content
english_content = trafilatura_extract(
    url="https://multilingual-site.com/page",
    language="en",
    language_threshold=0.9  # Confidence threshold
)

# Auto-detect language
content_with_lang = trafilatura_extract(
    url="https://example.com/article",
    detect_language=True,
    output_format="json"
)
print(f"Detected language: {content_with_lang['language']}")
```
## Custom Extraction Rules

```python
from praisonaiagents.tools import trafilatura_extract

# Define custom extraction rules
custom_rules = {
    'xpath_expressions': {
        'title': '//h1[@class="article-title"]',
        'author': '//span[@class="author-name"]',
        'content': '//div[@class="article-body"]'
    },
    'css_selectors': {
        'title': 'h1.article-title',
        'author': 'span.author-name',
        'content': 'div.article-body'
    },
    'remove_selectors': [
        'div.advertisement',
        'aside.sidebar',
        'div.related-articles'
    ]
}

# Apply the custom rules
content = trafilatura_extract(
    url="https://example.com/article",
    custom_rules=custom_rules
)
```
## Integration with AI Agents
### Content Analysis Pipeline
```python
from praisonaiagents import Agent, Task, Process
from praisonaiagents.tools import trafilatura_extract

# Content extractor
extractor = Agent(
    name="Web Extractor",
    instructions="Extract main content from web pages",
    tools=[trafilatura_extract]
)

# Content analyzer
analyzer = Agent(
    name="Content Analyzer",
    instructions="Analyze extracted content for key insights"
)

# Summarizer
summarizer = Agent(
    name="Summarizer",
    instructions="Create concise summaries of extracted content"
)

# Tasks
extract_task = Task(
    description="Extract content from https://example.com/important-article",
    agent=extractor
)
analyze_task = Task(
    description="Analyze the extracted content for main themes and key points",
    agent=analyzer
)
summarize_task = Task(
    description="Create a 3-paragraph summary of the content",
    agent=summarizer
)

# Process
content_pipeline = Process(
    agents=[extractor, analyzer, summarizer],
    tasks=[extract_task, analyze_task, summarize_task]
)

result = content_pipeline.run()
```
### Research Assistant
```python
from praisonaiagents import Agent, Task
from praisonaiagents.tools import trafilatura_extract, searxng_search

research_agent = Agent(
    name="Research Assistant",
    instructions="""You are a research assistant that:
    1. Searches for relevant sources
    2. Extracts content from found URLs
    3. Compiles comprehensive research reports""",
    tools=[searxng_search, trafilatura_extract]
)

research_task = Task(
    description="Research recent developments in quantum computing, extract content from the top 5 sources, and compile a report",
    agent=research_agent
)

report = research_task.execute()
```
## Advanced Features
### Content Quality Assessment
```python
from praisonaiagents.tools import trafilatura_extract

def assess_content_quality(url):
    content = trafilatura_extract(
        url=url,
        include_metadata=True,
        output_format="json"
    )
    if not content:
        return {"quality": "low", "reason": "No content extracted"}

    text_length = len(content.get('text', ''))
    has_metadata = all(content.get(field) for field in ['title', 'author', 'date'])

    quality_score = {
        'text_length': text_length,
        'has_metadata': has_metadata,
        'quality': 'high' if text_length > 500 and has_metadata else 'medium'
    }
    return quality_score
```
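A quick way to exercise the helper above (the URL is a placeholder):

```python
report = assess_content_quality("https://example.com/article")
print(report)  # e.g. {'text_length': 1240, 'has_metadata': True, 'quality': 'high'}
```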
### Incremental Web Scraping
```python
import json
import time
from datetime import datetime

from praisonaiagents.tools import trafilatura_extract

def incremental_scrape(url_list, checkpoint_file='scrape_checkpoint.json'):
    # Load checkpoint
    try:
        with open(checkpoint_file, 'r') as f:
            checkpoint = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        checkpoint = {'processed': [], 'last_run': None}

    results = []
    for url in url_list:
        if url in checkpoint['processed']:
            continue
        try:
            content = trafilatura_extract(url, include_metadata=True)
            results.append({
                'url': url,
                'content': content,
                'extracted_at': datetime.now().isoformat()
            })
            checkpoint['processed'].append(url)

            # Save checkpoint after each successful extraction
            checkpoint['last_run'] = datetime.now().isoformat()
            with open(checkpoint_file, 'w') as f:
                json.dump(checkpoint, f)

            time.sleep(1)  # Rate limiting
        except Exception as e:
            print(f"Error extracting {url}: {e}")

    return results
```
### Content Deduplication
```python
import hashlib

from praisonaiagents.tools import trafilatura_extract

def extract_unique_content(urls):
    seen_hashes = set()
    unique_content = []

    for url in urls:
        content = trafilatura_extract(url, output_format="text")
        if content:
            # Hash the extracted text to detect exact duplicates
            content_hash = hashlib.md5(content.encode()).hexdigest()
            if content_hash not in seen_hashes:
                seen_hashes.add(content_hash)
                unique_content.append({
                    'url': url,
                    'content': content,
                    'hash': content_hash
                })
            else:
                print(f"Duplicate content found: {url}")

    return unique_content
```
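Calling the helper over a URL list then yields only distinct pages (the URLs are placeholders):

```python
unique = extract_unique_content([
    "https://example.com/a",
    "https://example.com/a-mirror",  # skipped if its text matches exactly
])
print(f"{len(unique)} unique page(s) extracted")
```

Note that MD5 hashing only catches byte-identical text; detecting near-duplicates requires fuzzy techniques such as shingling or MinHash.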
## Best Practices
- **Rate Limiting**: Always implement delays between requests to avoid overwhelming servers
- **Error Handling**: Wrap extraction calls in try-except blocks
- **Content Validation**: Verify extracted content meets minimum quality standards
- **Metadata Preservation**: Always extract metadata when available
- **Language Filtering**: Use language detection for multilingual sites
- **Caching**: Cache extracted content to avoid redundant requests (a combined caching and rate-limiting sketch follows this list)
- **User Agent**: Set appropriate user agent strings
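A minimal sketch combining two of these practices: an in-memory cache plus a fixed delay between requests. The helper name `cached_extract` and the one-second delay are illustrative choices, not part of the tool's API.

```python
import time
from functools import lru_cache

from praisonaiagents.tools import trafilatura_extract

@lru_cache(maxsize=256)
def cached_extract(url):
    """Illustrative helper: cache results per URL and rate-limit requests."""
    time.sleep(1)  # simple rate limiting, applied only to uncached requests
    return trafilatura_extract(url, output_format="text")
```

Repeated calls with the same URL return the cached text without re-fetching.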
## Performance Optimization

### Asynchronous Extraction

```python
import asyncio

import aiohttp

from praisonaiagents.tools import trafilatura_extract_from_html

async def fetch_and_extract(session, url):
    try:
        async with session.get(url) as response:
            html = await response.text()
            content = trafilatura_extract_from_html(
                html=html,
                include_metadata=True
            )
            return {'url': url, 'content': content}
    except Exception as e:
        return {'url': url, 'error': str(e)}

async def batch_extract_async(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_and_extract(session, url) for url in urls]
        return await asyncio.gather(*tasks)

# Usage
urls = ["https://example1.com", "https://example2.com"]
results = asyncio.run(batch_extract_async(urls))
```
### Memory-Efficient Processing
```python
from praisonaiagents.tools import trafilatura_extract_from_html

def process_large_html_file(file_path):
    """Process large HTML files in chunks.

    Note: fixed-size chunks can split tags mid-stream, so some chunks
    may fail to parse; those are skipped.
    """
    chunk_size = 1024 * 1024  # 1MB chunks
    with open(file_path, 'r', encoding='utf-8') as f:
        html_content = f.read(chunk_size)
        while html_content:
            try:
                extracted = trafilatura_extract_from_html(html_content)
                if extracted:
                    yield extracted
            except Exception:
                pass  # skip chunks that cannot be parsed
            html_content = f.read(chunk_size)
```
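Since the function is a generator, results arrive one chunk at a time instead of loading the whole file into memory (the file path is a placeholder):

```python
for chunk_text in process_large_html_file("large_page.html"):
    print(len(chunk_text))
```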
## Troubleshooting
### Common Issues and Solutions
**Empty Extraction Results**

```python
from praisonaiagents.tools import trafilatura_extract

# Fall back through progressively looser extraction strategies
def robust_extract(url):
    strategies = [
        {'include_comments': False, 'deduplicate': True},
        {'min_length': 10, 'include_tables': True},
        {'output_format': 'xml', 'include_images': True}
    ]
    for strategy in strategies:
        content = trafilatura_extract(url, **strategy)
        if content and len(str(content)) > 100:
            return content
    return None
```
**Encoding Issues**

```python
import chardet
import requests

from praisonaiagents.tools import trafilatura_extract_from_html

# Handle pages served with unusual or mislabeled encodings
def extract_with_encoding_detection(url):
    response = requests.get(url)
    detected = chardet.detect(response.content)
    encoding = detected['encoding'] or 'utf-8'  # fall back if detection fails
    html = response.content.decode(encoding, errors='replace')
    return trafilatura_extract_from_html(html)
```
**JavaScript-Heavy Sites**

```python
import time

from selenium import webdriver

from praisonaiagents.tools import trafilatura_extract_from_html

# For JS-rendered content, combine extraction with browser automation
def extract_js_content(url):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(3)  # Wait for content to load
        html = driver.page_source
    finally:
        driver.quit()  # always release the browser, even on error
    return trafilatura_extract_from_html(html)
```
## Comparison with Other Extraction Tools

| Feature | Trafilatura | BeautifulSoup | Readability |
|---|---|---|---|
| Main content extraction | ✅ Excellent | ⚡ Manual | ✅ Good |
| Metadata extraction | ✅ Automatic | ❌ Manual | ⚡ Limited |
| Language detection | ✅ Built-in | ❌ No | ❌ No |
| Speed | ✅ Fast | ⚡ Medium | ⚡ Medium |
| Boilerplate removal | ✅ Excellent | ❌ Manual | ✅ Good |
| Table preservation | ✅ Yes | ✅ Yes | ❌ Limited |
For more information about web scraping and content extraction patterns, see the Web Automation documentation.