JSON Scraping

The following walks you through scraping JSON-LD microformat data embedded in an HTML page, as well as the JSON data typically embedded by SSR frameworks like Next.js. You may prefer this to extracting content from the raw HTML. Example of JSON-LD found embedded in an HTML page:

Example JSON-LD Microformat

Scraping JSON from HTML

Let's scrape the JSON embedded in the HTML of a page on a recipe website. We'll use the following parameters to scrape the JSON:

Scraping JSON from HTML Using API in Python

# Example: fetch the JSON embedded in a page's HTML through the Spider API.
import os

import requests

# The API key is read from the SPIDER_API_KEY environment variable.
headers = {
    'Authorization': f'Bearer {os.getenv("SPIDER_API_KEY")}',
    'Content-Type': 'application/json',
}

params = {
    "url": "https://www.allrecipes.com/recipe/223312/nutella-hazelnut-cookies",
    # NOTE(review): presumably "empty" skips returning the page content so
    # only the embedded JSON comes back -- confirm against the API docs.
    "return_format": "empty",
    "return_json_data": True,  # Return the JSON data embedded in the HTML
}

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.post(
    'https://api.spider.cloud/scrape',
    headers=headers,
    json=params,
    timeout=60,
)

# Fail loudly on HTTP errors (bad key, quota, server error) instead of
# trying to decode an error page as JSON.
response.raise_for_status()

print(response.json())

Example Response

The JSON will be in the response under the other_scripts array inside json_data:

Example Response JSON

{ "costs": { "ai_cost": 0, "bytes_transferred_cost": 0, "compute_cost": 0, "file_cost": 0.0005, "total_cost": 0.0005, "transform_cost": 0 }, "error": null, "json_data": { "other_scripts": [ { "@context": "http://schema.org", "@type": [ "Recipe" ], "aggregateRating": { "@type": "AggregateRating", "ratingCount": "102", "ratingValue": "4.7" }, "author": [ { "@type": "Person", "name": "Carmella DiNardo" } ], "cookTime": "PT10M", "dateModified": "2024-03-11T20:46:57.568-04:00", "datePublished": "2020-06-18T23:50:15.000-04:00", "description": "Nutella cookies made with chocolate-hazelnut spread, cocoa powder, chocolate chips, and chopped toasted hazelnuts. Perfect for Nutella lovers!", "headline": "Nutella Cookies", "image": { "@type": "ImageObject", "height": 1125, "url": "https://www.allrecipes.com/thmb/ZIU2pPfKYiMqJzOuEv2OzNaAgz8=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/223312nutella-hazelnut-cookiesKim4x3-65664615e8ff44b3aff48bdd40748b39.jpg", "width": 1500 }, "mainEntityOfPage": { "@id": "https://www.allrecipes.com/recipe/223312/nutella-hazelnut-cookies/", "@type": [ "WebPage" ], "breadcrumb": { "@type": "BreadcrumbList", "itemListElement": [ { "@type": "ListItem", "item": { "@id": "https://www.allrecipes.com/recipes/", "name": "Recipes" }, "position": 1 }, { "@type": "ListItem", "item": { "@id": "https://www.allrecipes.com/recipes/79/desserts/", "name": "Desserts" }, "position": 2 }, { "@type": "ListItem", "item": { "@id": "https://www.allrecipes.com/recipes/362/desserts/cookies/", "name": "Cookies" }, "position": 3 }, { "@type": "ListItem", "item": { "@id": "https://www.allrecipes.com/recipes/840/desserts/cookies/chocolate-cookies/", "name": "Chocolate Cookie Recipes" }, "position": 4 } ] } }, "name": "Nutella Cookies", ... }

Scrape Next.js SSR data embedded in the HTML

Using the same return_json_data parameter, we can also scrape the SSR data from pages built with Next.js and similar JS frontend frameworks.

Example response for SSR. The JSON object is found in the NEXT_DATA property inside json_data:

Scraping Next.js SSR Data

[ { "content": null, "costs": { "ai_cost": 0, "bytes_transferred_cost": 0, "compute_cost": 0, "file_cost": 0.0005, "total_cost": 0.0005, "transform_cost": 0 }, "error": null, "json_data": { "NEXT_DATA":"{"props":{"pageProps":{"geo":{"_id":"city:ca_san-jose","_score":36047.598,"area_type":"city","city":"San Jose","state_code":"CA","counties":[{"name":"Santa Clara","fips":"06085","state_code":"CA"}],"country":"USA","centroid":{"lon":-121.8145519,"lat":37.2960112},"slug_id":"San-Jose_CA","geo_id":"c6922355-d3ee-5c49-9144-072f956c0264","county_needed_for_uniq":false},"userAgent":"Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0","pageType":"forSale","page":1,"view":"list","filters":{"bath":{},"bed":{},"featureTags":{"tags":[]},"propertyType":{"type":[]},"listingStatus":{},"pets":{},"homeSize":{},"expand":{"radius":0,"nearbyAreas":[]},"price":{},"homeAge":{},"daysOnRealtor":{},"lotSize":{},"keyword":{"keywords":[]},"hoa":{"type":"any","fee":-1},"commute":{"address":"","time":30,"traffic":true,"transportation_type":"drive"},"showPending":false,"rentalPromotion":false,"threeDimensionalTours":false,"moveInDate":{"min":"","max":""},"featured":"","monthlyPayment":{},"builderPromotion":false},"nearbyItems":[],"properties":[{"property_id":"1143655170","list_price":6498000,"search_promotions":null,"primary_photo":{"href":"https://ap.rdcpix.com/" ... } } ]