JSON Scraping
This guide walks you through scraping JSON-LD microformat data embedded in an HTML page, as well as the JSON data typically embedded by SSR frameworks like Next.js. You may prefer this to extracting content from the raw HTML. Example of JSON-LD embedded in an HTML page:
Example JSON-LD Microformat
Scraping JSON from HTML
Let's scrape the JSON embedded in the HTML of a page on a recipe website. We'll use the following parameters to scrape the JSON:
Scraping JSON from HTML Using API in Python
import requests
import os
# Example: request the JSON embedded in a page's HTML from the scrape endpoint
# and print the decoded response.

# Fail fast on a missing key instead of sending the literal header "Bearer None".
api_key = os.getenv("SPIDER_API_KEY")
if not api_key:
    raise RuntimeError("Set the SPIDER_API_KEY environment variable before running.")

headers = {
    'Authorization': f'Bearer {api_key}',
    'Content-Type': 'application/json',
}

params = {
    "url": "https://www.allrecipes.com/recipe/223312/nutella-hazelnut-cookies",
    # NOTE(review): presumably "empty" skips returning the page content itself
    # (the example responses show "content": null) - confirm against the API docs.
    "return_format": "empty",
    "return_json_data": True,  # Return the JSON data embedded in the HTML
}

response = requests.post(
    'https://api.spider.cloud/scrape',
    headers=headers,
    json=params,
    timeout=60,  # requests waits indefinitely unless a timeout is given
)
# Raise on 4xx/5xx so an error body is not printed as if it were scraped data.
response.raise_for_status()

print(response.json())
Example Response
The JSON will be in the response under the `other_scripts` array inside `json_data`:
Example Response JSON
{
"costs": {
"ai_cost": 0,
"bytes_transferred_cost": 0,
"compute_cost": 0,
"file_cost": 0.0005,
"total_cost": 0.0005,
"transform_cost": 0
},
"error": null,
"json_data": {
"other_scripts": [
{
"@context": "http://schema.org",
"@type": [
"Recipe"
],
"aggregateRating": {
"@type": "AggregateRating",
"ratingCount": "102",
"ratingValue": "4.7"
},
"author": [
{
"@type": "Person",
"name": "Carmella DiNardo"
}
],
"cookTime": "PT10M",
"dateModified": "2024-03-11T20:46:57.568-04:00",
"datePublished": "2020-06-18T23:50:15.000-04:00",
"description": "Nutella cookies made with chocolate-hazelnut spread, cocoa powder, chocolate chips, and chopped toasted hazelnuts. Perfect for Nutella lovers!",
"headline": "Nutella Cookies",
"image": {
"@type": "ImageObject",
"height": 1125,
"url": "https://www.allrecipes.com/thmb/ZIU2pPfKYiMqJzOuEv2OzNaAgz8=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/223312nutella-hazelnut-cookiesKim4x3-65664615e8ff44b3aff48bdd40748b39.jpg",
"width": 1500
},
"mainEntityOfPage": {
"@id": "https://www.allrecipes.com/recipe/223312/nutella-hazelnut-cookies/",
"@type": [
"WebPage"
],
"breadcrumb": {
"@type": "BreadcrumbList",
"itemListElement": [
{
"@type": "ListItem",
"item": {
"@id": "https://www.allrecipes.com/recipes/",
"name": "Recipes"
},
"position": 1
},
{
"@type": "ListItem",
"item": {
"@id": "https://www.allrecipes.com/recipes/79/desserts/",
"name": "Desserts"
},
"position": 2
},
{
"@type": "ListItem",
"item": {
"@id": "https://www.allrecipes.com/recipes/362/desserts/cookies/",
"name": "Cookies"
},
"position": 3
},
{
"@type": "ListItem",
"item": {
"@id": "https://www.allrecipes.com/recipes/840/desserts/cookies/chocolate-cookies/",
"name": "Chocolate Cookie Recipes"
},
"position": 4
}
]
}
},
"name": "Nutella Cookies",
...
}
Scrape Next.js SSR data embedded in the HTML
Using the same `return_json_data` parameter, we can also scrape the SSR data on Next.js pages and on pages built with similar JavaScript frontend frameworks.
Example response for SSR. The JSON object is found in the `NEXT_DATA` property of `json_data`:
Scraping Next.js SSR Data
[
{
"content": null,
"costs": {
"ai_cost": 0,
"bytes_transferred_cost": 0,
"compute_cost": 0,
"file_cost": 0.0005,
"total_cost": 0.0005,
"transform_cost": 0
},
"error": null,
"json_data": {
"NEXT_DATA":"{"props":{"pageProps":{"geo":{"_id":"city:ca_san-jose","_score":36047.598,"area_type":"city","city":"San Jose","state_code":"CA","counties":[{"name":"Santa Clara","fips":"06085","state_code":"CA"}],"country":"USA","centroid":{"lon":-121.8145519,"lat":37.2960112},"slug_id":"San-Jose_CA","geo_id":"c6922355-d3ee-5c49-9144-072f956c0264","county_needed_for_uniq":false},"userAgent":"Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0","pageType":"forSale","page":1,"view":"list","filters":{"bath":{},"bed":{},"featureTags":{"tags":[]},"propertyType":{"type":[]},"listingStatus":{},"pets":{},"homeSize":{},"expand":{"radius":0,"nearbyAreas":[]},"price":{},"homeAge":{},"daysOnRealtor":{},"lotSize":{},"keyword":{"keywords":[]},"hoa":{"type":"any","fee":-1},"commute":{"address":"","time":30,"traffic":true,"transportation_type":"drive"},"showPending":false,"rentalPromotion":false,"threeDimensionalTours":false,"moveInDate":{"min":"","max":""},"featured":"","monthlyPayment":{},"builderPromotion":false},"nearbyItems":[],"properties":[{"property_id":"1143655170","list_price":6498000,"search_promotions":null,"primary_photo":{"href":"https://ap.rdcpix.com/"
...
}
}
]