diff --git a/pyproject.toml b/pyproject.toml
index 8d5c8e3..4955e7f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,4 +44,5 @@ dependencies = [
     "bio>=1.8.0",
     "opencv-python>=4.11.0.86",
     "pypdf2>=3.0.1",
+    "ddgs>=9.6.1",
 ]
diff --git a/src/agent/web_agent.py b/src/agent/web_agent.py
index 3305630..6245829 100644
--- a/src/agent/web_agent.py
+++ b/src/agent/web_agent.py
@@ -6,6 +6,7 @@ from urllib.parse import quote_plus, urljoin, urlparse
 import time
 
 from .llm_provider import LLMProvider
+from ddgs import DDGS
 
 class WebAgent:
     """Web agent for handling search queries and content extraction."""
@@ -213,108 +214,49 @@ def _search_with_firecrawl_api(self, query: str, num_results: int) -> List[Dict[
             return []
 
     def _fallback_search(self, query: str, num_results: int) -> List[Dict[str, Any]]:
-        """Fallback search using DuckDuckGo."""
+        """Fallback search using DDGS (DuckDuckGo).
+
+        Args:
+            query: Search terms; a blank query returns no results.
+            num_results: Maximum number of results to request.
+
+        Returns:
+            A list of result dicts with 'title', 'content', 'description',
+            'url', 'source' and 'type' keys, or an empty list when the
+            search fails.
+        """
+        # Nothing to search for: skip the network call entirely
+        if not query or not query.strip() or num_results <= 0:
+            return []
+
         try:
-            # First try DuckDuckGo instant answers API
-            instant_answer = self._get_duckduckgo_instant_answer(query)
-            if instant_answer:
-                return [instant_answer]
-
-            # Fall back to basic search results
-            return self._scrape_search_results(query, num_results)
-
-        except Exception as e:
-            print(f"Fallback search error: {e}")
-            return []
-
-    def _get_duckduckgo_instant_answer(self, query: str) -> Optional[Dict[str, Any]]:
-        """Get instant answers from DuckDuckGo API."""
-        try:
-            url = f"https://api.duckduckgo.com/?q={quote_plus(query)}&format=json&no_html=1&skip_disambig=1"
-            response = self.session.get(url, timeout=10)
-
-            if response.status_code == 200:
-                data = response.json()
-
-                # Check for abstract (Wikipedia-style answers)
-                if data.get('Abstract'):
-                    return {
-                        'title': data.get('Heading', query),
-                        'content': data['Abstract'],
-                        'url': data.get('AbstractURL', ''),
-                        'source': 'DuckDuckGo Instant Answer',
-                        'type': 'instant_answer'
-                    }
-
-                # Check for definition
-                if data.get('Definition'):
-                    return {
-                        'title': f"Definition of {query}",
-                        'content': data['Definition'],
-                        'url': data.get('DefinitionURL', ''),
-                        'source': 'DuckDuckGo Definition',
-                        'type': 'definition'
-                    }
-
-                # Check for answer (direct answers)
-                if data.get('Answer'):
-                    return {
-                        'title': query,
-                        'content': data['Answer'],
-                        'url': '',
-                        'source': 'DuckDuckGo Direct Answer',
-                        'type': 'direct_answer'
-                    }
-        except Exception as e:
-            print(f"DuckDuckGo API error: {e}")
-
-        return None
-
-    def _scrape_search_results(self, query: str, num_results: int) -> List[Dict[str, Any]]:
-        """Scrape search results from DuckDuckGo."""
-        try:
-            # Use DuckDuckGo HTML search
-            search_url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
-            response = self.session.get(search_url, timeout=15)
-
-            if response.status_code != 200:
-                return []
-
-            # Simple regex-based extraction (basic implementation)
-            content = response.text
-            results = []
-
-            # Pattern to match search result links and titles
-            result_pattern = r']+href="([^"]+)"[^>]*class="result__a"[^>]*>([^<]+)'
-            snippet_pattern = r']+class="result__snippet"[^>]*>([^<]+)'
-
-            links = re.findall(result_pattern, content)
-            snippets = re.findall(snippet_pattern, content)
-
-            for i, (url, title) in enumerate(links[:num_results]):
-                # Clean up the URL (DuckDuckGo wraps URLs)
-                if url.startswith('/l/?uddg='):
-                    # Extract the actual URL from DuckDuckGo's redirect
-                    import urllib.parse
-                    parsed = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
-                    actual_url = parsed.get('uddg', [''])[0]
-                    if actual_url:
-                        url = urllib.parse.unquote(actual_url)
-
-                snippet = snippets[i] if i < len(snippets) else ""
-
-                results.append({
-                    'title': title.strip(),
-                    'content': snippet.strip(),
-                    'url': url,
-                    'source': 'Web Search',
+            # Use DDGS text search with multiple backend support
+            ddgs = DDGS(timeout=10)
+            # `or []` treats a None/empty response as "no results"
+            results = ddgs.text(
+                query=query,
+                region="us-en",
+                safesearch="moderate",
+                max_results=num_results,
+                backend="auto",  # Auto-selects best available backend
+            ) or []
+
+            # Convert DDGS result format to our internal format
+            return [
+                {
+                    # Fall back to placeholders when a field is missing or empty
+                    'title': result.get('title') or 'No title',
+                    'content': result.get('body') or '',
+                    'description': result.get('body') or '',
+                    'url': result.get('href') or '',
+                    'source': 'DDGS Search',
                     'type': 'search_result'
-                })
-
-            return results
+                }
+                for result in results
+            ]
 
         except Exception as e:
-            print(f"Search scraping error: {e}")
+            print(f"DDGS search error: {e}")
             return []
 
     def _analyze_and_enhance_results(self, results: List[Dict[str, Any]], original_query: str) -> List[Dict[str, Any]]: