Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions recipe_scrapers/_schemaorg.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# find a package that parses https://schema.org/Recipe properly (or create one ourselves).
from __future__ import annotations

import json
import re
from itertools import chain

import extruct
Expand All @@ -23,6 +25,50 @@


class SchemaOrg:
@staticmethod
def _normalize_nextjs_jsonld(html: str) -> str:
    """Lift JSON-LD from Next.js App Router ``__next_s.push()`` injections.

    Next.js App Router SSR pages inject JSON-LD via:

        (self.__next_s=self.__next_s||[]).push([0,
            {"type":"application/ld+json","children":"{...}"}
        ])

    instead of a static ``<script type="application/ld+json">`` tag. This
    method detects the pattern and adds an equivalent static script tag for
    every such payload, so that parsers which only look at static tags can
    see the data.

    Every ``push([0, {...}])`` call inside every ``<script>`` element is
    examined; whitespace around the brackets and the comma is tolerated.
    Only objects whose ``type`` is ``application/ld+json`` and whose
    ``children`` is a string are lifted.

    Args:
        html: The page markup.

    Returns:
        *html* unchanged when the pattern is absent (fast path), when no
        usable payload is found, or when *html* is not a ``str``. Otherwise
        *html* with the generated script tags inserted immediately before
        the first ``</head>`` (matched case-insensitively), or prepended
        when there is no ``</head>``.
    """
    # The substring and regex checks below are str-only; hand anything else
    # (e.g. bytes) back untouched instead of raising TypeError.
    if not isinstance(html, str):
        return html
    if "__next_s" not in html or "application/ld+json" not in html:
        return html

    decoder = json.JSONDecoder()
    injected: list[str] = []
    script_bodies = re.findall(
        r"<script[^>]*>(.*?)</script>", html, re.DOTALL | re.IGNORECASE
    )
    for script_body in script_bodies:
        if "__next_s" not in script_body or "application/ld+json" not in script_body:
            continue
        # The lookahead leaves the match ending exactly on the opening brace,
        # which is where raw_decode() must start (it skips no whitespace).
        for push in re.finditer(r"push\(\s*\[\s*0\s*,\s*(?=\{)", script_body):
            try:
                # raw_decode() stops at the end of the first complete JSON
                # value, so trailing JavaScript or further push() calls in
                # the same script cannot corrupt the parse.
                obj, _ = decoder.raw_decode(script_body, push.end())
            except json.JSONDecodeError:
                continue
            if not isinstance(obj, dict):
                continue
            children = obj.get("children")
            if obj.get("type") != "application/ld+json" or not isinstance(children, str):
                continue
            # "</" can only occur inside a JSON string, where "<\/" is an
            # equivalent escape; rewriting it keeps a decoded "</script>"
            # from terminating the generated element early.
            injected.append(children.replace("</", "<\\/"))

    if not injected:
        return html

    extra = "".join(
        f'<script type="application/ld+json">{content}</script>'
        for content in injected
    )
    head_close = re.search(r"</head\s*>", html, re.IGNORECASE)
    if head_close is None:
        return extra + html
    # Splice by slicing rather than re.sub so backslashes in *extra* are
    # never interpreted as replacement-template escapes.
    cut = head_close.start()
    return html[:cut] + extra + html[cut:]

@staticmethod
def _contains_schematype(item, schematype):
itemtype = item.get("@type", "")
Expand All @@ -40,6 +86,7 @@ def _find_entity(self, item, schematype):
return node

def __init__(self, page_data):
page_data = self._normalize_nextjs_jsonld(page_data)
self.format = None
self.data = {}
self.people = {}
Expand Down
24 changes: 24 additions & 0 deletions tests/library/test_schemaorg.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,27 @@ def test_best_image_setting_toggle(self):
)
finally:
settings.BEST_IMAGE_SELECTION = original

def test_nextjs_ssr_jsonld(self):
    """Recipe JSON-LD delivered through a Next.js ``__next_s.push()`` call is parsed."""
    import json as _json

    recipe = {
        "@context": "https://schema.org",
        "@type": "Recipe",
        "name": "Test Next.js Recipe",
        "recipeIngredient": ["500g fish"],
        "recipeInstructions": ["Cook the fish."],
    }
    # The JSON-LD document travels as a JSON string under "children",
    # inside the object handed to push().
    push_payload = _json.dumps(
        {
            "type": "application/ld+json",
            "async": True,
            "children": _json.dumps(recipe),
        }
    )
    push_script = (
        f"<script>(self.__next_s=self.__next_s||[]).push([0,{push_payload}])</script>"
    )
    page_data = "".join(
        (
            "<html><head><title>Test</title>",
            push_script,
            "</head><body></body></html>",
        )
    )

    schema = SchemaOrg(page_data)

    self.assertEqual("Test Next.js Recipe", schema.title())
    self.assertIn("500g fish", schema.ingredients())
    self.assertEqual("Cook the fish.", schema.instructions())