Scrape & Crawl Agent with Microsoft’s Autogen
Set up an Autogen agent that scrapes and crawls websites using the Spider API.
Setup OpenAI
Get OpenAI running with the following steps:
- Create an account and get an API key on OpenAI.
- Install the OpenAI package and set the API key as an environment variable in your project. This keeps the key out of your code.
pip install openai
In your terminal:
export OPENAI_API_KEY=<your-api-key-here>
Alternatively, you can use the dotenv package to load the environment variables from a .env file. Create a .env file in your project root and add the following:
OPENAI_API_KEY=<your-api-key-here>
Then, in your Python code:
from dotenv import load_dotenv
from openai import OpenAI
import os
load_dotenv()
client = OpenAI(
api_key=os.environ.get("OPENAI_API_KEY"),
)
- Test OpenAI to see if things are working correctly:
import os
from openai import OpenAI
client = OpenAI(
api_key=os.environ.get("OPENAI_API_KEY"),
)
chat_completion = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "user",
"content": "What are large language models?",
}
]
)
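If the request succeeds, print the model's reply to confirm everything is wired up:
print(chat_completion.choices[0].message.content)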
Setup Spider & Autogen
After you get your API key, install the Spider client and Autogen. For the full API reference, see the Spider API Guide.
Install the Spider Python client library and autogen:
pip install spider_client pyautogen
Set up the Autogen LLM configuration:
import os
config_list = [
{"model": "gpt-4o", "api_key": os.getenv("OPENAI_API_KEY")},
]
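Autogen tries the entries in config_list in order, so you can optionally add a fallback model; gpt-4o-mini below is just an example, not something this setup requires:
config_list = [
    {"model": "gpt-4o", "api_key": os.getenv("OPENAI_API_KEY")},
    {"model": "gpt-4o-mini", "api_key": os.getenv("OPENAI_API_KEY")},  # example fallback entry
]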
And we need to set the Spider API key:
spider_api_key = os.getenv("SPIDER_API_KEY")
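It's worth failing fast if the key is missing; a small sanity check:
if not spider_api_key:
    raise RuntimeError("SPIDER_API_KEY is not set; export it or add it to your .env file.")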
Creating Scrape & Crawl Functions
Import Spider to call the API:
from spider import Spider
Defining functions for the agents
Define the scrape and crawl functions the agent will call. We use the Spider SDK and default to return_format: markdown for LLM-ready data.
from typing_extensions import Annotated
from typing import List, Dict, Any
def scrape_page(url: Annotated[str, "The URL of the web page to scrape"], params: Annotated[dict, "Dictionary of additional params."] = None) -> Annotated[Dict[str, Any], "Scraped content"]:
# Initialize the Spider client with your API key, if no api key is specified it looks for SPIDER_API_KEY in your environment variables
client = Spider(spider_api_key)
    if params is None:
params = {
"return_format": "markdown"
}
scraped_data = client.scrape_url(url, params)
return scraped_data[0]
def crawl_page(url: Annotated[str, "The url of the domain to be crawled"], params: Annotated[dict, "Dictionary of additional params."] = None) -> Annotated[List[Dict[str, Any]], "Scraped content"]:
# Initialize the Spider client with your API key, if no api key is specified it looks for SPIDER_API_KEY in your environment variables
client = Spider(spider_api_key)
    if params is None:
params = {
"return_format": "markdown"
}
crawled_data = client.crawl_url(url, params)
return crawled_data
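Before wiring these into agents, you can call the helpers directly to verify your Spider access. A quick manual check, assuming each result item exposes a content field as described in the Spider API docs:
# Optional sanity check outside of Autogen.
page = scrape_page("https://william-espegren.com")
print((page.get("content") or "")[:200])  # first 200 characters of the markdown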
Create the scrape and crawl agents, and assign them the functions above:
from autogen import ConversableAgent
# Create web scraper agent.
scraper_agent = ConversableAgent(
"WebScraper",
llm_config={"config_list": config_list},
system_message="You are a web scraper and you can scrape any web page to retrieve its contents."
"Returns 'TERMINATE' when the scraping is done.",
)
# Create web crawler agent.
crawler_agent = ConversableAgent(
"WebCrawler",
llm_config={"config_list": config_list},
system_message="You are a web crawler and you can crawl any page with deeper crawling following subpages."
"Returns 'TERMINATE' when the scraping is done.",
)
How do we tell the agents to do things?
A user proxy agent sends messages to the other agents; here it is a ConversableAgent with no LLM and no human input. See the Autogen UserProxyAgent docs for details.
user_proxy_agent = ConversableAgent(
"UserProxy",
llm_config=False, # No LLM for this agent.
human_input_mode="NEVER",
code_execution_config=False, # No code execution for this agent.
is_termination_msg=lambda x: x.get("content", "") is not None and "terminate" in x["content"].lower(),
default_auto_reply="Please continue if not finished, otherwise return 'TERMINATE'.",
)
Registering the functions
Register the functions with the correct agents using register_function:
from autogen import register_function
register_function(
scrape_page,
caller=scraper_agent,
executor=user_proxy_agent,
name="scrape_page",
description="Scrape a web page and return the content.",
)
register_function(
crawl_page,
caller=crawler_agent,
executor=user_proxy_agent,
name="crawl_page",
description="Crawl an entire domain, following subpages and return the content.",
)
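register_function builds the tool schemas from the Annotated type hints on the functions. If you want to see exactly what the LLM receives, the schemas should be available on each caller's llm_config (assuming the standard pyautogen behavior of storing them under "tools"):
# Inspect the generated tool schemas (names, descriptions, parameters).
print(scraper_agent.llm_config["tools"])
print(crawler_agent.llm_config["tools"])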
Using the agents
Start the conversation with user_proxy_agent, then summarize the results with Autogen’s built-in reflection_with_llm summary method:
# Scrape page
scraped_chat_result = user_proxy_agent.initiate_chat(
scraper_agent,
message="Can you scrape william-espegren.com for me?",
summary_method="reflection_with_llm",
summary_args={
"summary_prompt": """Summarize the scraped content"""
},
)
# Crawl page
crawled_chat_result = user_proxy_agent.initiate_chat(
crawler_agent,
message="Can you crawl william-espegren.com for me, I want the whole domains information?",
summary_method="reflection_with_llm",
summary_args={
"summary_prompt": """Summarize the crawled content"""
},
)
The output is stored in the summary:
print(scraped_chat_result.summary)
print(crawled_chat_result.summary)
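If you want to keep the results around, you can persist the summaries with plain file writes; a minimal sketch:
# Save the LLM-generated summaries to disk.
with open("scraped_summary.md", "w") as f:
    f.write(str(scraped_chat_result.summary))
with open("crawled_summary.md", "w") as f:
    f.write(str(crawled_chat_result.summary))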
Full code.
Two agents: one that scrapes a page and one that crawls subpages. Both can be combined with your other Autogen agents.
import os
from spider import Spider
from typing_extensions import Annotated
from typing import List, Dict, Any
from autogen import ConversableAgent
from autogen import register_function
config_list = [
{"model": "gpt-4o", "api_key": os.getenv("OPENAI_API_KEY")},
]
spider_api_key = os.getenv("SPIDER_API_KEY")
def scrape_page(url: Annotated[str, "The URL of the web page to scrape"], params: Annotated[dict, "Dictionary of additional params."] = None) -> Annotated[Dict[str, Any], "Scraped content"]:
# Initialize the Spider client with your API key, if no api key is specified it looks for SPIDER_API_KEY in your environment variables
client = Spider(spider_api_key)
    if params is None:
params = {
"return_format": "markdown"
}
scraped_data = client.scrape_url(url, params)
return scraped_data[0]
def crawl_page(url: Annotated[str, "The url of the domain to be crawled"], params: Annotated[dict, "Dictionary of additional params."] = None) -> Annotated[List[Dict[str, Any]], "Scraped content"]:
# Initialize the Spider client with your API key, if no api key is specified it looks for SPIDER_API_KEY in your environment variables
client = Spider(spider_api_key)
    if params is None:
params = {
"return_format": "markdown"
}
crawled_data = client.crawl_url(url, params)
return crawled_data
# Create web scraper agent.
scraper_agent = ConversableAgent(
"WebScraper",
llm_config={"config_list": config_list},
system_message="You are a web scraper and you can scrape any web page to retrieve its contents."
"Returns 'TERMINATE' when the scraping is done.",
)
# Create web crawler agent.
crawler_agent = ConversableAgent(
"WebCrawler",
llm_config={"config_list": config_list},
system_message="You are a web crawler and you can crawl any page with deeper crawling following subpages."
"Returns 'TERMINATE' when the scraping is done.",
)
user_proxy_agent = ConversableAgent(
"UserProxy",
llm_config=False, # No LLM for this agent.
human_input_mode="NEVER",
code_execution_config=False, # No code execution for this agent.
is_termination_msg=lambda x: x.get("content", "") is not None and "terminate" in x["content"].lower(),
default_auto_reply="Please continue if not finished, otherwise return 'TERMINATE'.",
)
register_function(
scrape_page,
caller=scraper_agent,
executor=user_proxy_agent,
name="scrape_page",
description="Scrape a web page and return the content.",
)
register_function(
crawl_page,
caller=crawler_agent,
executor=user_proxy_agent,
name="crawl_page",
description="Crawl an entire domain, following subpages and return the content.",
)
# Scrape page
scraped_chat_result = user_proxy_agent.initiate_chat(
scraper_agent,
message="Can you scrape william-espegren.com for me?",
summary_method="reflection_with_llm",
summary_args={
"summary_prompt": """Summarize the scraped content"""
},
)
print(scraped_chat_result.summary)
If you liked this guide, consider checking out me and Spider on Twitter:
- Author Twitter: @WilliamEspegren
- Spider Twitter: @spider_rust
- Related article: Dify + Spider Workflow by Tomihide Kaketani