Crawl4AI Praison AI Integration
Install
pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git" transformers torch nltk
pip install praisonai
export OPENAI_API_KEY=xxxxxxxxx
import os
from crawl4ai import WebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

# Schema describing the structured records the LLM should extract.
class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

url = 'https://openai.com/api/pricing/'

# Start the crawler and run an LLM-driven schema extraction over the page.
crawler = WebCrawler()
crawler.warmup()

result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
        schema=OpenAIModelFee.schema(),
        extraction_type="schema",
        instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
        Do not miss any models in the entire content. One extracted model JSON format should look like this:
        {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
    ),
    bypass_cache=True,
)

print(result.extracted_content)
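The same extraction can be driven by a PraisonAI agent. The agents.yaml below defines a web_scraper role whose task relies on a custom ModelFeeTool (a hedged sketch of that tool follows the YAML):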
agents.yaml
framework: crewai
topic: extract model pricing from websites
roles:
  web_scraper:
    backstory: An expert in web scraping with a deep understanding of extracting structured
      data from online sources. https://openai.com/api/pricing/ https://www.anthropic.com/pricing
      https://cohere.com/pricing
    goal: Gather model pricing data from various websites
    role: Web Scraper
    tasks:
      scrape_model_pricing:
        description: Scrape model pricing information from the provided list of websites.
        expected_output: Raw HTML or JSON containing model pricing data.
    tools:
    - 'ModelFeeTool'
dependencies: []
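The tools list above references a ModelFeeTool that is not defined in this snippet. Below is a minimal sketch of such a tool, assuming the BaseTool interface exposed by praisonai_tools and PraisonAI's convention of loading custom tools from a tools.py next to agents.yaml; the class name matches the YAML entry, while the description and instruction text are illustrative. It simply wraps the Crawl4AI extraction logic shown earlier.
import os
from crawl4ai import WebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from praisonai_tools import BaseTool
from pydantic import BaseModel, Field

# Same schema as in the standalone script above.
class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

class ModelFeeTool(BaseTool):
    name: str = "ModelFeeTool"
    description: str = "Extracts model name, input fee, and output fee from a pricing page URL."

    def _run(self, url: str) -> str:
        # Reuse the crawler and LLM extraction strategy from the script above.
        crawler = WebCrawler()
        crawler.warmup()
        result = crawler.run(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                schema=OpenAIModelFee.schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names "
                            "along with their fees for input and output tokens.",
            ),
            bypass_cache=True,
        )
        return result.extracted_content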
Run
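With agents.yaml in place (and the ModelFeeTool available to PraisonAI, e.g. in a tools.py alongside it), the workflow is typically started with the PraisonAI CLI installed by the pip package above; the invocation assumes the standard praisonai entry point:
praisonai agents.yaml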