Copy-paste code for common Spider API tasks. Each recipe is complete and runnable. For full parameter details, see the API reference. For real-world applications, see Use Cases.
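Every Python recipe below reads the API key from the SPIDER_API_KEY environment variable and builds the same two request headers. If you want to fail fast before running a recipe, here is a minimal sketch of that shared setup (only the variable and header names come from the recipes themselves):
Python
import os

# The recipes expect SPIDER_API_KEY to be set in your environment.
api_key = os.getenv("SPIDER_API_KEY")
if not api_key:
    raise SystemExit("Set SPIDER_API_KEY before running these recipes.")

# The same headers dict appears in every JSON recipe on this page.
headers = {
    'Authorization': f'Bearer {api_key}',
    'Content-Type': 'application/json',
}
print("API key loaded; headers ready.")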
Crawl one or many pages from a URL. Use limit to cap the number of pages and depth to control how many link hops from the start URL. Set request: "smart" to let Spider choose the fastest strategy per page.
Python
import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post('https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 50,
        "depth": 2,
        "return_format": "markdown",
        "request": "smart"
    }
)

for page in response.json():
    print(f"{page['url']} ({page['status']})")
    print(page['content'][:200])
    print("---")

cURL
curl 'https://api.spider.cloud/crawl' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"url": "https://example.com", "limit": 50, "depth": 2, "return_format": "markdown", "request": "smart"}'

Use css_extraction_map to pull named fields from pages using CSS selectors. Map URL path patterns to arrays of selectors — Spider returns the matched content as structured key-value pairs. For AI-powered extraction with JSON Schema, see AI Studio.
Python
import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post('https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 10,
        "return_format": "markdown",
        "css_extraction_map": {
            "/": [
                { "name": "titles", "selectors": ["h1", "h2"] },
                { "name": "paragraphs", "selectors": ["p"] },
                { "name": "links", "selectors": ["a[href]"] }
            ],
            "/blog": [
                { "name": "article_title", "selectors": ["h1"] },
                { "name": "author", "selectors": [".author", "meta[name='author']"] },
                { "name": "body", "selectors": ["article", ".post-content"] }
            ]
        }
    }
)

for page in response.json():
    print(f"{page['url']}")
    extracted = page.get('extracted_data')
    if extracted:
        for key, values in extracted.items():
            print(f" {key}: {values}")

cURL
curl 'https://api.spider.cloud/crawl' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{
    "url": "https://example.com",
    "limit": 10,
    "return_format": "markdown",
    "css_extraction_map": {
      "/": [
        {"name": "titles", "selectors": ["h1", "h2"]},
        {"name": "paragraphs", "selectors": ["p"]}
      ]
    }
  }'

Take a full-page screenshot of any URL. The API returns a base64-encoded PNG that you can save directly to a file.
Python
import requests
import base64
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post('https://api.spider.cloud/screenshot',
    headers=headers,
    json={
        "url": "https://example.com"
    }
)

data = response.json()
with open('screenshot.png', 'wb') as f:
    f.write(base64.b64decode(data[0]['content']))
print("Saved screenshot.png")

cURL
curl 'https://api.spider.cloud/screenshot' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"url": "https://example.com"}'

Query search engines and optionally fetch the content of each result. Set fetch_page_content: true to get the full page content alongside search metadata.
Python
import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post('https://api.spider.cloud/search',
    headers=headers,
    json={
        "search": "web scraping best practices 2026",
        "limit": 5,
        "fetch_page_content": True,
        "return_format": "markdown"
    }
)

for result in response.json():
    print(f"{result['url']}")
    print(f" {result['content'][:150]}...")
    print()

cURL
curl 'https://api.spider.cloud/search' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"search": "web scraping best practices 2026", "limit": 5, "fetch_page_content": true, "return_format": "markdown"}'

Process pages as they finish crawling instead of waiting for the entire job. Set the Content-Type header to application/jsonl and read the response as a stream of newline-delimited JSON. See Concurrent Streaming for details.
Python
import requests
import json
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/jsonl',
}

response = requests.post('https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 100,
        "return_format": "markdown",
        "request": "smart"
    },
    stream=True
)

with response as r:
    r.raise_for_status()
    for line in r.iter_lines(chunk_size=None, decode_unicode=True):
        if not line:
            continue  # skip any blank lines in the stream
        page = json.loads(line)
        print(f"Crawled: {page['url']} ({page['status']}) - {len(page.get('content', ''))} chars")

Node.js
const response = await fetch('https://api.spider.cloud/crawl', {
  method: 'POST',
  headers: {
    'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
    'Content-Type': 'application/jsonl',
  },
  body: JSON.stringify({
    url: 'https://example.com',
    limit: 100,
    return_format: 'markdown',
    request: 'smart',
  }),
});

const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = '';

while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  buffer += decoder.decode(value, { stream: true });
  const lines = buffer.split('\n');
  buffer = lines.pop();
  for (const line of lines) {
    if (line.trim()) {
      const page = JSON.parse(line);
      console.log(`Crawled: ${page.url} (${page.status})`);
    }
  }
}

// Parse any trailing data left in the buffer once the stream ends
if (buffer.trim()) {
  const page = JSON.parse(buffer);
  console.log(`Crawled: ${page.url} (${page.status})`);
}

Map every link on a site without downloading page content. The /links endpoint returns URLs and their HTTP status codes — useful for sitemaps, SEO audits, and finding broken links.
Python
import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post('https://api.spider.cloud/links',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 200
    }
)

links = response.json()
broken = [l for l in links if l.get('status', 200) >= 400]
print(f"Found {len(links)} links, {len(broken)} broken")
for link in broken:
    print(f" {link['status']} {link['url']}")

cURL
curl 'https://api.spider.cloud/links' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"url": "https://example.com", "limit": 200}'

Use automation_scripts to click buttons, fill forms, and navigate before extracting content. Each action targets a URL path pattern — Spider runs the steps in order when that path is crawled. Requires request: "chrome".
Python - Login and scrape
import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post('https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com/login",
        "limit": 1,
        "request": "chrome",
        "return_format": "markdown",
        "automation_scripts": {
            "/login": [
                { "Fill": { "selector": "input[name='email']", "value": "user@example.com" } },
                { "Fill": { "selector": "input[name='password']", "value": "s3cret" } },
                { "Click": "button[type='submit']" },
                { "WaitForNavigation": True },
                { "Wait": 1000 }
            ]
        }
    }
)

page = response.json()[0]
print(page['content'][:500])

Python - Infinite scroll + load more
# Load more content by scrolling and clicking
response = requests.post('https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com/feed",
        "limit": 1,
        "request": "chrome",
        "return_format": "markdown",
        "automation_scripts": {
            "/feed": [
                { "InfiniteScroll": 5000 },
                { "WaitForAndClick": "button.load-more" },
                { "Wait": 2000 },
                { "InfiniteScroll": 3000 }
            ]
        }
    }
)

print(f"Content length: {len(response.json()[0]['content'])} chars")

Use root_selector to extract only the content you want, and exclude_selector to strip out noise like navbars, footers, and ads. Works with any return format.
Python
import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

# Extract only the article body, removing sidebars and nav
response = requests.post('https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com/blog/my-post",
        "limit": 1,
        "return_format": "markdown",
        "root_selector": "article, main, .post-content",
        "exclude_selector": "nav, footer, .sidebar, .comments, .ads"
    }
)

page = response.json()[0]
print(page['content'])

cURL
curl 'https://api.spider.cloud/crawl' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{
    "url": "https://example.com/blog/my-post",
    "limit": 1,
    "return_format": "markdown",
    "root_selector": "article, main, .post-content",
    "exclude_selector": "nav, footer, .sidebar, .comments, .ads"
  }'

The /crawl endpoint already converts pages to markdown when you set return_format: "markdown". If you already have raw HTML on hand, use the /transform endpoint to convert it without crawling.
Python
import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

html = """
<h1>Hello World</h1>
<p>This is a blog post with <a href="#">a link</a>.</p>
<ul>
  <li>Item 1</li>
  <li>Item 2</li>
</ul>
"""

response = requests.post('https://api.spider.cloud/transform',
    headers=headers,
    json={
        "data": html,
        "return_format": "markdown",
        "readability": True
    }
)

print(response.json()[0]['content'])

cURL
curl 'https://api.spider.cloud/transform' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{
    "data": "<h1>Hello</h1><p>World</p>",
    "return_format": "markdown",
    "readability": true
  }'

For large crawls, send results to a webhook or pipe them directly into cloud storage with data connectors. Both fire as each page finishes — no polling needed. A minimal sketch of a webhook receiver follows the S3 example.
Python - Webhook delivery
import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

# Option A: Deliver to your own endpoint via webhook
response = requests.post('https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 500,
        "return_format": "markdown",
        "request": "smart",
        "webhook": "https://your-server.com/api/spider-webhook"
    }
)

print(f"Crawl started: {response.status_code}")

Python - S3 data connector
# Option B: Stream directly to S3 via data connectors
response = requests.post('https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 500,
        "return_format": "markdown",
        "request": "smart",
        "data_connectors": {
            "s3": {
                "bucket": "my-crawl-data",
                "access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
                "secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
                "region": "us-west-2",
                "prefix": "crawls/"
            },
            "on_find": True
        }
    }
)

print("Crawl complete — results in S3")