58 changes: 58 additions & 0 deletions akd/tools/resolvers/__init__.py
@@ -0,0 +1,58 @@
"""
URL resolver package for the AKD project.

This package contains various resolvers for transforming URLs to their final destinations,
such as DOI URLs, PDF URLs, or publisher URLs. It supports multiple input sources including
direct URLs, PDF URLs, and DOI identifiers from search results.
"""

# Base classes and schemas
from ._base import (
    ArticleResolverConfig,
    BaseArticleResolver,
    ResolverInputSchema,
    ResolverOutputSchema,
)

# Individual resolvers
from .ads import ADSResolver
from .arxiv import ArxivResolver
from .crossref_doi import (
    CrossRefDoiResolver,
    CrossRefDoiResolverConfig,
    CrossRefDoiResolverInputSchema,
    CrossRefDoiResolverOutputSchema,
)

# Composite resolver
from .composite import ResearchArticleResolver
from .identity import IdentityResolver

# Specialized resolvers
from .specialized import DOIResolver, PDFUrlResolver
from .unpaywall import UnpaywallResolver

__all__ = [
    # Base classes and schemas
    "BaseArticleResolver",
    "ResolverInputSchema",
    "ResolverOutputSchema",
    "ArticleResolverConfig",
    # Specialized resolvers
    "PDFUrlResolver",
    "DOIResolver",
    "UnpaywallResolver",
    # Individual resolvers
    "IdentityResolver",
    "ArxivResolver",
    "ADSResolver",
    # CrossRef DOI resolvers
    "CrossRefDoiResolver",
    "CrossRefDoiResolverConfig",
    "CrossRefDoiResolverInputSchema",
    "CrossRefDoiResolverOutputSchema",
    # Composite resolver
    "ResearchArticleResolver",
]
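
Not part of the diff: a minimal usage sketch of how these exports might be composed. Only the names come from the package above; the ResearchArticleResolver constructor signature, the example DOI URL, and the assumption that ResolverInputSchema needs only a url are illustrative guesses.

# Hypothetical usage sketch -- names from akd.tools.resolvers, signatures assumed.
import asyncio

from akd.tools.resolvers import (
    ArxivResolver,
    CrossRefDoiResolver,
    ResearchArticleResolver,
    ResolverInputSchema,
    UnpaywallResolver,
)


async def main() -> None:
    # Assumed: the composite resolver takes the individual resolvers to try in order.
    resolver = ResearchArticleResolver(
        ArxivResolver(),
        CrossRefDoiResolver(),
        UnpaywallResolver(),
    )
    # Assumed: only `url` is required; other ResolverInputSchema fields are optional.
    result = await resolver.resolve(
        ResolverInputSchema(url="https://doi.org/10.1000/xyz123"),
    )
    if result is not None:
        print(result.url, result.pdf_url, result.resolvers)


asyncio.run(main())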
151 changes: 151 additions & 0 deletions akd/tools/resolvers/unpaywall.py
@@ -0,0 +1,151 @@
import re
from typing import Optional

import httpx
from loguru import logger
from pydantic import HttpUrl

from ._base import BaseArticleResolver, ResolverInputSchema, ResolverOutputSchema


class UnpaywallResolver(BaseArticleResolver):
    """Resolver for finding open access versions via Unpaywall API."""

    def validate_url(self, url: HttpUrl | str) -> bool:
        """Check if this URL contains a DOI that can be resolved via Unpaywall."""
        url_str = str(url)
        # Look for DOI patterns in the URL
        doi_patterns = [
            r'10\.\d{4,}/[^\s"<>#]+',  # Standard DOI format
            r'/doi/(?:full/|pdf/|pdfdirect/)?(10\.[^/?#]+)',  # DOI in path
        ]

        for pattern in doi_patterns:
            if re.search(pattern, url_str, re.IGNORECASE):
                return True
        return False

    def _extract_doi_from_url(self, url: str) -> Optional[str]:
        """Extract DOI from URL using pattern matching."""
        doi_patterns = [
            r'10\.\d{4,}/[^\s"<>#]+',  # Standard DOI format
            r'/doi/(?:full/|pdf/|pdfdirect/)?(10\.[^/?#]+)',  # DOI in path
        ]

        for pattern in doi_patterns:
            match = re.search(pattern, url, re.IGNORECASE)
            if match:
                # Return the captured group if present, otherwise the full match
                return match.group(1) if match.groups() else match.group(0)
        return None

    async def resolve(self, params: ResolverInputSchema) -> ResolverOutputSchema | None:
        """
        Resolve a DOI URL to its open access version via Unpaywall API.

        Args:
            params: ResolverInputSchema containing the URL with DOI to resolve

        Returns:
            ResolverOutputSchema with open access PDF URL if found, otherwise original URL
        """
        url_str = str(params.url)
        doi = self._extract_doi_from_url(url_str)

        if not doi:
            if self.debug:
                logger.debug(f"No DOI found in URL: {url_str}")
            return ResolverOutputSchema(
                url=params.url,
                title=params.title,
                query=params.query,
                doi=params.doi,
                pdf_url=params.pdf_url,
                authors=params.authors,
                resolvers=[self.__class__.__name__]
            )

        try:
            # Query Unpaywall API
            unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=research@example.com"

            async with httpx.AsyncClient(timeout=self.validation_timeout) as client:
                response = await client.get(
                    unpaywall_url,
                    headers=self.headers
                )

                if response.status_code != 200:
                    if self.debug:
                        logger.debug(f"Unpaywall API returned {response.status_code} for DOI: {doi}")
                    return ResolverOutputSchema(
                        url=params.url,
                        title=params.title,
                        query=params.query,
                        doi=params.doi,
                        pdf_url=params.pdf_url,
                        authors=params.authors,
                        resolvers=[self.__class__.__name__]
                    )

                data = response.json()

                # Check if paper is open access and has a PDF URL
                if data.get('is_oa', False):
                    best_oa_location = data.get('best_oa_location')
                    if best_oa_location and best_oa_location.get('url_for_pdf'):
                        pdf_url = best_oa_location['url_for_pdf']
                        if self.debug:
                            logger.debug(f"Found open access PDF via Unpaywall: {pdf_url}")
                        return ResolverOutputSchema(
                            url=pdf_url,
                            title=params.title,
                            query=params.query,
                            doi=doi,
                            pdf_url=pdf_url,
                            authors=params.authors,
                            resolvers=[self.__class__.__name__],
                            resolved_url=pdf_url
                        )

                    # Fallback to host URL if no direct PDF
                    if best_oa_location and best_oa_location.get('host_type') in ['publisher', 'repository']:
                        oa_url = best_oa_location.get('url')
                        if oa_url:
                            if self.debug:
                                logger.debug(f"Found open access version via Unpaywall: {oa_url}")
                            return ResolverOutputSchema(
                                url=oa_url,
                                title=params.title,
                                query=params.query,
                                doi=doi,
                                pdf_url=params.pdf_url,
                                authors=params.authors,
                                resolvers=[self.__class__.__name__],
                                resolved_url=oa_url
                            )

                if self.debug:
                    logger.debug(f"No open access version found for DOI: {doi}")
                return ResolverOutputSchema(
                    url=params.url,
                    title=params.title,
                    query=params.query,
                    doi=doi,
                    pdf_url=params.pdf_url,
                    authors=params.authors,
                    resolvers=[self.__class__.__name__]
                )

        except Exception as e:
            if self.debug:
                logger.debug(f"Error querying Unpaywall for DOI {doi}: {e}")
            return ResolverOutputSchema(
                url=params.url,
                title=params.title,
                query=params.query,
                doi=params.doi,
                pdf_url=params.pdf_url,
                authors=params.authors,
                resolvers=[self.__class__.__name__]
            )
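
For reference (not part of the diff), a standalone sketch of the Unpaywall v2 lookup that UnpaywallResolver performs. The endpoint and the response fields (is_oa, best_oa_location, url_for_pdf, url) are the ones checked in the code above; the DOI and email below are placeholders.

# Standalone sketch mirroring UnpaywallResolver's API call; DOI and email are placeholders.
import asyncio

import httpx


async def lookup_open_access(doi: str, email: str) -> str | None:
    """Return a PDF URL (or landing-page URL) for an open-access copy, if any."""
    url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
    async with httpx.AsyncClient(timeout=10.0) as client:
        response = await client.get(url)
    if response.status_code != 200:
        return None
    data = response.json()
    if not data.get("is_oa", False):
        return None
    location = data.get("best_oa_location") or {}
    # Prefer a direct PDF link, then fall back to the hosting page.
    return location.get("url_for_pdf") or location.get("url")


print(asyncio.run(lookup_open_access("10.1000/xyz123", "you@example.org")))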
13 changes: 13 additions & 0 deletions akd/tools/scrapers/__init__.py
@@ -5,8 +5,15 @@
    ScraperToolInputSchema,
    ScraperToolOutputSchema,
)
from .composite import (
    CompositeScraper,
    ResearchArticleResolver,
)
from .pypaperbot_scraper import PyPaperBotScraperConfig
from .waterfall import WaterfallScraper, WaterfallScraperConfig
from .omni import DoclingScraper, DoclingScraperConfig, OmniScraperInputSchema
from .pdf_scrapers import PDFScraperInputSchema, SimplePDFScraper
from .pypaperbot_scraper import PyPaperBotScraper
from .web_scrapers import Crawl4AIWebScraper, SimpleWebScraper

__all__ = [
@@ -19,6 +26,12 @@
"DoclingScraper",
"DoclingScraperConfig",
"OmniScraperInputSchema",
"PyPaperBotScraper",
"PyPaperBotScraperConfig",
"CompositeScraper",
"WaterfallScraper",
"WaterfallScraperConfig",
"ResearchArticleResolver",
"ScrapedMetadata",
"ScraperToolBase",
"ScraperToolConfig",
30 changes: 21 additions & 9 deletions akd/tools/scrapers/omni.py
@@ -145,8 +145,11 @@ def _setup_converter(
        format_options.update(custom_options)

        if self.debug:
            opts_str = str(format_options)
            if len(opts_str) > 100:
                opts_str = f"{opts_str[:100]}..."
            logger.debug(
                f"Docling format options :: {format_options}",
                f"Docling format options :: {opts_str}",
            )

        self.doc_converter = DocumentConverter(
@@ -221,15 +224,24 @@ async def _arun(
        path = unquote(params.url.path)
        try:
            content, meta = await self._process_document(path)
            return ScraperToolOutputSchema(content=content, metadata=meta)

            if isinstance(content, str) and content.strip():
                return ScraperToolOutputSchema(content=content, metadata=meta)
        except FileNotFoundError as e:
            # local file was missing
            raise RuntimeError(f"[File Not Found] {e}")
        except Exception as _e:
            pass

        except RuntimeError as e:
            # issues during conversion
            raise RuntimeError(f"[Conversion Error] {e}")
        try:
            # Lazy import to avoid circular dependency: pypaperbot_scraper imports omni for Docling classes
            from akd.tools.scrapers.pypaperbot_scraper import (
                PyPaperBotScraper,  # type: ignore
            )

        except Exception as e:
            raise RuntimeError(f"[Internal Error] Failed to scrape {path}") from e
            fallback = PyPaperBotScraper(debug=self.debug)
            pb_out = await fallback.arun(ScraperToolInputSchema(url=str(params.url)))
            if pb_out.content and pb_out.content.strip():
                return pb_out
        except Exception:
            pass

        raise RuntimeError(f"[Internal Error] Failed to scrape {path}")