Concurrent Streaming
Streaming Responses
We recommend enabling streaming responses when crawling large websites or when crawling with a high page limit. The crawler runs at full concurrency, crawling and processing pages in parallel. Streaming lets you process each page's data as it is received, rather than waiting for the entire crawl to complete, which helps avoid delays and reduces the risk of timeouts or data loss.
Streaming Responses in API
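The example below reads the response as newline-delimited JSON (JSONL), handling each page record as soon as it arrives. It assumes the requests and jsonlines packages are installed and that your API key is exported as SPIDER_API_KEY.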
import requests
import jsonlines
from typing import Dict
import os


def process_item(item: Dict):
    # Handle a single page record as soon as it arrives from the stream.
    url = item.get("url", "unknown_url")
    status = item.get("status", 0)
    content = item.get("content", {})
    print(f"URL: {url}")
    print(f"Status: {status}")
    print(f"Content: {content}")


headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/jsonl',  # set jsonl as content-type
}

params = {
    "url": "https://www.example.com",
    "limit": 30,
    "depth": 3,
    "request": "smart",
    "return_format": "raw",
}

response = requests.post(
    'https://api.spider.cloud/crawl',
    headers=headers,
    json=params,
    stream=True,  # set to True so the body is not buffered in full
    timeout=60,
)
response.raise_for_status()

# Parse each newline-delimited JSON object as it is received.
reader = jsonlines.Reader(response.raw)
for item in reader:
    try:
        process_item(item)
    except Exception as e:
        print(f"Error processing item: {e}")
        continue
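Because every line in the stream is a self-contained JSON object, you can also persist results incrementally instead of holding them in memory. A minimal sketch, assuming a local output file named crawl_results.jsonl (the path is just an illustration), would replace the final loop above with:

with jsonlines.open("crawl_results.jsonl", mode="a") as writer:
    for item in reader:
        writer.write(item)   # append the raw page record to disk
        process_item(item)   # and process it in the same pass

Writing as you read means a dropped connection or timeout only costs the pages that had not yet arrived.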
Streaming Responses in Python SDK
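The Python SDK exposes the same streaming behavior through crawl_url. The snippet below assumes the spider-client package, with the Spider client reading your key from the SPIDER_API_KEY environment variable.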
from spider import Spider

# Initialize the client; the SDK falls back to the SPIDER_API_KEY
# environment variable when no key is passed explicitly.
app = Spider()


def handle_json(json_obj: dict) -> None:
    # Called for each page as soon as it is received from the stream.
    assert json_obj["url"] is not None


url = "https://www.example.com"
params = {
    "limit": 30,
    "depth": 3,
    "request": "smart",
    "return_format": "markdown",
}

response = app.crawl_url(
    url,
    params=params,
    stream=True,  # set to True
    callback=handle_json,
)
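With stream=True and a callback, handle_json is invoked once per page while the crawl is still running, so you can persist or forward each page immediately rather than waiting for the full response.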