Recipes

Copy-paste code for common Spider API tasks. Each recipe is complete and runnable. For full parameter details, see the API reference. For real-world applications, see Use Cases.

Crawl a Website

Crawl one or many pages from a URL. Use limit to cap the number of pages and depth to control how many link hops to follow from the start URL. Set request: "smart" to let Spider choose the fastest strategy per page.

Python

import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 50,
        "depth": 2,
        "return_format": "markdown",
        "request": "smart"
    }
)

for page in response.json():
    print(f"{page['url']} ({page['status']})")
    print(page['content'][:200])
    print("---")

cURL

curl 'https://api.spider.cloud/crawl' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"url": "https://example.com", "limit": 50, "depth": 2, "return_format": "markdown", "request": "smart"}'
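
If you want to keep the crawled markdown around for later processing, a minimal sketch like the one below writes each page to a local file. The pages/ directory and the URL-based filename scheme are arbitrary choices for illustration, not part of the API.

import os
import re

import requests

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={"url": "https://example.com", "limit": 50, "return_format": "markdown", "request": "smart"}
)

# Local output directory (arbitrary name for this example).
os.makedirs('pages', exist_ok=True)

for page in response.json():
    # Build a filesystem-safe filename from the URL; this scheme is just an example.
    name = re.sub(r'[^a-zA-Z0-9]+', '_', page['url']).strip('_') or 'index'
    with open(os.path.join('pages', f"{name}.md"), 'w', encoding='utf-8') as f:
        f.write(page.get('content') or '')

print("Saved crawled pages to ./pages/")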

Extract Structured Data

Use css_extraction_map to pull named fields from pages using CSS selectors. Map URL path patterns to arrays of selectors — Spider returns the matched content as structured key-value pairs. For AI-powered extraction with JSON Schema, see AI Studio.

Python

import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 10,
        "return_format": "markdown",
        "css_extraction_map": {
            "/": [
                {"name": "titles", "selectors": ["h1", "h2"]},
                {"name": "paragraphs", "selectors": ["p"]},
                {"name": "links", "selectors": ["a[href]"]}
            ],
            "/blog": [
                {"name": "article_title", "selectors": ["h1"]},
                {"name": "author", "selectors": [".author", "meta[name='author']"]},
                {"name": "body", "selectors": ["article", ".post-content"]}
            ]
        }
    }
)

for page in response.json():
    print(f"{page['url']}")
    extracted = page.get('extracted_data')
    if extracted:
        for key, values in extracted.items():
            print(f"  {key}: {values}")

cURL

curl 'https://api.spider.cloud/crawl' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{
    "url": "https://example.com",
    "limit": 10,
    "return_format": "markdown",
    "css_extraction_map": {
      "/": [
        {"name": "titles", "selectors": ["h1", "h2"]},
        {"name": "paragraphs", "selectors": ["p"]}
      ]
    }
  }'
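
To work with the extracted fields outside of Python, a small helper like the following can flatten them into a CSV. It assumes each named field in extracted_data maps to a list of matched strings, as in the Python recipe above.

import csv

def extracted_to_csv(pages, path='extracted.csv'):
    """Flatten extracted_data from crawled pages into url/field/value rows."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['url', 'field', 'value'])
        for page in pages:
            for field, values in (page.get('extracted_data') or {}).items():
                # Assumes each named field maps to a list of matched strings.
                for value in values:
                    writer.writerow([page['url'], field, value])

# Usage with the Python recipe above:
# extracted_to_csv(response.json())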

Capture a Screenshot

Take a full-page screenshot of any URL. The API returns a base64-encoded PNG that you can save directly to a file.

Python

import requests
import base64
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post(
    'https://api.spider.cloud/screenshot',
    headers=headers,
    json={"url": "https://example.com"}
)

data = response.json()
with open('screenshot.png', 'wb') as f:
    f.write(base64.b64decode(data[0]['content']))
print("Saved screenshot.png")

cURL

curl 'https://api.spider.cloud/screenshot' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"url": "https://example.com"}'
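
As an optional sanity check on the saved file, a sketch like this opens the PNG with the third-party Pillow library (not required by the API) and reports its dimensions.

# pip install Pillow
from PIL import Image

# Inspect the screenshot saved by the recipe above.
with Image.open('screenshot.png') as img:
    print(f"{img.format} image, {img.width}x{img.height} pixels")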

Search the Web

Query search engines and optionally fetch the content of each result. Set fetch_page_content: true to get the full page content alongside search metadata.

Python

import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post(
    'https://api.spider.cloud/search',
    headers=headers,
    json={
        "search": "web scraping best practices 2026",
        "limit": 5,
        "fetch_page_content": True,
        "return_format": "markdown"
    }
)

for result in response.json():
    print(f"{result['url']}")
    print(f"  {result['content'][:150]}...")
    print()

cURL

curl 'https://api.spider.cloud/search' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"search": "web scraping best practices 2026", "limit": 5, "fetch_page_content": true, "return_format": "markdown"}'
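
If you prefer to fetch content selectively, a two-step sketch like this runs the search without fetch_page_content and then crawls only the results you pick. It assumes each search result still includes a url field, as in the recipe above.

import os

import requests

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

# Step 1: search without fetching page content.
search = requests.post(
    'https://api.spider.cloud/search',
    headers=headers,
    json={"search": "web scraping best practices 2026", "limit": 5}
)
urls = [result['url'] for result in search.json()]

# Step 2: crawl only the results you care about (here, the first two).
for url in urls[:2]:
    crawl = requests.post(
        'https://api.spider.cloud/crawl',
        headers=headers,
        json={"url": url, "limit": 1, "return_format": "markdown"}
    )
    page = crawl.json()[0]
    print(f"{page['url']}: {len(page.get('content', ''))} chars")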

Stream Results in Real-Time

Process pages as they finish crawling instead of waiting for the entire job. Set the Content-Type header to application/jsonl and read the response as a stream of newline-delimited JSON. See Concurrent Streaming for details.

Python

import requests
import json
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/jsonl',
}

response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 100,
        "return_format": "markdown",
        "request": "smart"
    },
    stream=True
)

with response as r:
    r.raise_for_status()
    for line in r.iter_lines(chunk_size=None, decode_unicode=True):
        if not line:
            # Skip blank lines between JSON records.
            continue
        page = json.loads(line)
        print(f"Crawled: {page['url']} ({page['status']}) - {len(page.get('content', ''))} chars")

Node.js

const response = await fetch('https://api.spider.cloud/crawl', {
  method: 'POST',
  headers: {
    'Authorization': `Bearer ${process.env.SPIDER_API_KEY}`,
    'Content-Type': 'application/jsonl',
  },
  body: JSON.stringify({
    url: 'https://example.com',
    limit: 100,
    return_format: 'markdown',
    request: 'smart',
  }),
});

const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = '';

while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  buffer += decoder.decode(value, { stream: true });
  const lines = buffer.split('\n');
  buffer = lines.pop();
  for (const line of lines) {
    if (line.trim()) {
      const page = JSON.parse(line);
      console.log(`Crawled: ${page.url} (${page.status})`);
    }
  }
}

Map All Links on a Site

Map every link on a site without downloading page content. The /links endpoint returns URLs and their HTTP status codes — useful for sitemaps, SEO audits, and finding broken links.

Python

import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post(
    'https://api.spider.cloud/links',
    headers=headers,
    json={"url": "https://example.com", "limit": 200}
)

links = response.json()
broken = [l for l in links if l.get('status', 200) >= 400]
print(f"Found {len(links)} links, {len(broken)} broken")
for link in broken:
    print(f"  {link['status']} {link['url']}")

cURL

curl 'https://api.spider.cloud/links' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"url": "https://example.com", "limit": 200}'
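
The same response also works for sitemap generation. The sketch below, which assumes each entry carries url and status fields as above, writes the successfully resolved URLs into a basic XML sitemap.

import os
from xml.sax.saxutils import escape

import requests

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

links = requests.post(
    'https://api.spider.cloud/links',
    headers=headers,
    json={"url": "https://example.com", "limit": 200}
).json()

# Keep only URLs that resolved successfully.
ok_urls = [l['url'] for l in links if l.get('status', 200) < 400]

with open('sitemap.xml', 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
    for url in ok_urls:
        f.write(f'  <url><loc>{escape(url)}</loc></url>\n')
    f.write('</urlset>\n')

print(f"Wrote sitemap.xml with {len(ok_urls)} URLs")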

Automate Browser Interactions

Use automation_scripts to click buttons, fill forms, and navigate before extracting content. Scripts are keyed by URL path pattern — Spider runs the steps in order when a matching path is crawled. Requires request: "chrome".

Python - Login and scrape

import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com/login",
        "limit": 1,
        "request": "chrome",
        "return_format": "markdown",
        "automation_scripts": {
            "/login": [
                {"Fill": {"selector": "input[name='email']", "value": "user@example.com"}},
                {"Fill": {"selector": "input[name='password']", "value": "s3cret"}},
                {"Click": "button[type='submit']"},
                {"WaitForNavigation": True},
                {"Wait": 1000}
            ]
        }
    }
)

page = response.json()[0]
print(page['content'][:500])

Python - Infinite scroll + load more

# Load more content by scrolling and clicking
response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com/feed",
        "limit": 1,
        "request": "chrome",
        "return_format": "markdown",
        "automation_scripts": {
            "/feed": [
                {"InfiniteScroll": 5000},
                {"WaitForAndClick": "button.load-more"},
                {"Wait": 2000},
                {"InfiniteScroll": 3000}
            ]
        }
    }
)

print(f"Content length: {len(response.json()[0]['content'])} chars")

Target Content with CSS Selectors

Use root_selector to extract only the content you want, and exclude_selector to strip out noise like navbars, footers, and ads. Works with any return format.

Python

import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

# Extract only the article body, removing sidebars and nav
response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com/blog/my-post",
        "limit": 1,
        "return_format": "markdown",
        "root_selector": "article, main, .post-content",
        "exclude_selector": "nav, footer, .sidebar, .comments, .ads"
    }
)

page = response.json()[0]
print(page['content'])

cURL

curl 'https://api.spider.cloud/crawl' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{
    "url": "https://example.com/blog/my-post",
    "limit": 1,
    "return_format": "markdown",
    "root_selector": "article, main, .post-content",
    "exclude_selector": "nav, footer, .sidebar, .comments, .ads"
  }'

Transform HTML to Markdown

The /crawl endpoint already converts pages to markdown when you set return_format: "markdown". If you already have raw HTML on hand, use the /transform endpoint to convert it without crawling.

Python

import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

# Raw HTML to convert.
html = """
<article>
  <h1>Hello World</h1>
  <p>This is a blog post with a <a href="https://example.com">link</a>.</p>
  <ul>
    <li>Item 1</li>
    <li>Item 2</li>
  </ul>
</article>
<footer>Copyright 2026</footer>
"""

response = requests.post(
    'https://api.spider.cloud/transform',
    headers=headers,
    json={
        "data": html,
        "return_format": "markdown",
        "readability": True
    }
)

print(response.json()[0]['content'])

cURL

curl 'https://api.spider.cloud/transform' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -H 'Content-Type: application/json' \
  -d '{"data": "<h1>Hello</h1><p>World</p>", "return_format": "markdown", "readability": true}'

Deliver Results Asynchronously

For large crawls, send results to a webhook or pipe them directly into cloud storage with data connectors. Both fire as each page finishes — no polling needed.

Python - Webhook delivery

import requests
import os

headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

# Option A: Deliver to your own endpoint via webhook
response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 500,
        "return_format": "markdown",
        "request": "smart",
        "webhook": "https://your-server.com/api/spider-webhook"
    }
)

print(f"Crawl started: {response.status_code}")
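
On the receiving side, a minimal handler along these lines can accept the deliveries. The payload shape is an assumption here (JSON page objects like the ones the API returns directly), so inspect a real delivery and adjust the parsing.

# pip install flask
from flask import Flask, request

app = Flask(__name__)

@app.route('/api/spider-webhook', methods=['POST'])
def spider_webhook():
    # Assumes the webhook body is JSON containing crawled page data;
    # inspect a real delivery to confirm the exact shape.
    payload = request.get_json(force=True, silent=True)
    if payload is not None:
        pages = payload if isinstance(payload, list) else [payload]
        for page in pages:
            print(f"Received: {page.get('url')} ({page.get('status')})")
    return '', 200

if __name__ == '__main__':
    app.run(port=8000)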

Python - S3 data connector

# Option B: Stream directly to S3 via data connectors
response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json={
        "url": "https://example.com",
        "limit": 500,
        "return_format": "markdown",
        "request": "smart",
        "data_connectors": {
            "s3": {
                "bucket": "my-crawl-data",
                "access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
                "secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
                "region": "us-west-2",
                "prefix": "crawls/"
            },
            "on_find": True
        }
    }
)

print("Crawl complete — results in S3")
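
To confirm the objects landed, a short check with boto3 (a separate AWS SDK dependency, not part of the Spider API) lists whatever was written under the prefix.

# pip install boto3
import boto3

# Uses the same bucket, region, and prefix as the connector example above.
s3 = boto3.client('s3', region_name='us-west-2')
resp = s3.list_objects_v2(Bucket='my-crawl-data', Prefix='crawls/')
for obj in resp.get('Contents', []):
    print(f"{obj['Key']} ({obj['Size']} bytes)")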