Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ celerybeat.pid
# Environments
.env
.envrc
.venv
.venv*
env/
venv/
ENV/
Expand Down
34 changes: 18 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,23 @@ A metasearch library that aggregates results from diverse web search services.


## Table of Contents
* [Install](#install)
* [CLI version](#cli-version)
* [DDGS search operators](#ddgs-search-operators)
* [Regions](#regions)
* [Engines](#engines)
* [Tips](#tips)
* [DDGS class](#ddgs-class)
* [Proxy](#proxy)
* [Exceptions](#exceptions)
* [1. text()](#1-text)
* [2. images()](#2-images)
* [3. videos()](#3-videos)
* [4. news()](#4-news)
* [5. books()](#5-books)
* [Disclaimer](#disclaimer)
- [DDGS | Dux Distributed Global Search](#ddgs--dux-distributed-global-search)
- [Table of Contents](#table-of-contents)
- [Install](#install)
- [CLI version](#cli-version)
- [DDGS search operators](#ddgs-search-operators)
- [Regions](#regions)
- [Engines](#engines)
- [Tips](#tips)
- [DDGS class](#ddgs-class)
- [Proxy](#proxy)
- [Exceptions](#exceptions)
- [1. text()](#1-text)
- [2. images()](#2-images)
- [3. videos()](#3-videos)
- [4. news()](#4-news)
- [5. books()](#5-books)
- [Disclaimer](#disclaimer)

## Install
```python
Expand Down Expand Up @@ -156,7 +158,7 @@ ___

| DDGS function | Available backends |
| --------------|:-------------------|
| text() | `bing`, `brave`, `duckduckgo`, `google`, `mojeek`, `yandex`, `yahoo`, `wikipedia`|
| text() | `bing`, `brave`, `duckduckgo`, `google`, `mojeek`, `sogou`, `yandex`, `yahoo`, `wikipedia`|
| images() | `duckduckgo` |
| videos() | `duckduckgo` |
| news() | `bing`, `duckduckgo`, `yahoo` |
Expand Down
1 change: 1 addition & 0 deletions ddgs/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ def version() -> str:
"duckduckgo",
"google",
"mojeek",
"sogou",
"yandex",
"yahoo",
"wikipedia",
Expand Down
89 changes: 89 additions & 0 deletions ddgs/engines/sogou.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Sogou search engine implementation."""

from __future__ import annotations
Copy link
Owner

@deedy5 deedy5 Dec 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please delete from __future__ import annotations


import logging
import re
from typing import TYPE_CHECKING, Any, ClassVar
from urllib.parse import urljoin

if TYPE_CHECKING:
from collections.abc import Mapping

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult

logger = logging.getLogger(__name__)


class Sogou(BaseSearchEngine[TextResult]):
    """Sogou web search engine (text category).

    Scrapes result pages from https://www.sogou.com/web. Sogou wraps most
    outbound result links in a redirect endpoint (``sogou.com/link?url=``);
    resolving those to their real targets is handled by ``_normalize_href``.
    """

    name = "sogou"      # backend identifier exposed to DDGS callers
    category = "text"   # this engine serves only text() searches
    provider = "sogou"

    search_url = "https://www.sogou.com/web"
    search_method = "GET"

    # One result card per 'vrwrap' container; 'hint' containers are
    # notices/ads that match the same class prefix and are excluded.
    items_xpath = "//div[contains(@class, 'vrwrap') and not(contains(@class, 'hint'))]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h3//a//text()",
        "href": ".//h3//a/@href",
        "body": ".//div[contains(@class, 'space-txt')]//text()",
    }

    # Sogou's redirect interstitial delivers the target URL either via a JS
    # window.location.replace(...) call or a <meta http-equiv="refresh"> tag;
    # both patterns capture it into the 'url' group.
    _redirect_pattern = re.compile(r"window\.location\.replace\([\"'](?P<url>[^\"']+)[\"']\)")
    _meta_refresh_pattern = re.compile(r"URL='?(?P<url>[^'\"]+)", re.IGNORECASE)

def __init__(self, proxy: str | None = None, timeout: int | None = None, *, verify: bool | str = True) -> None:
    """Initialize the engine and an empty per-instance redirect cache."""
    super().__init__(proxy=proxy, timeout=timeout, verify=verify)
    # Memoizes raw Sogou redirect hrefs -> resolved target URLs, so each
    # wrapper link is fetched at most once per engine instance.
    self._href_cache: dict[str, str] = {}

def build_payload(
self,
query: str,
region: str, # noqa: ARG002
safesearch: str, # noqa: ARG002
timelimit: str | None,
page: int = 1,
**kwargs: str, # noqa: ARG002
) -> dict[str, Any]:
"""Build a payload for the search request."""
payload = {"query": query, "ie": "utf8", "p": "40040100", "dp": "1"}
if timelimit:
payload["tsn"] = {"d": "1", "w": "7", "m": "30", "y": "365"}[timelimit]
if page > 1:
payload["page"] = str(page)
return payload

def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
    """Filter and normalize freshly extracted results.

    Drops entries missing a title or an href (cards that matched the item
    xpath but carried no usable link), and rewrites each surviving href
    from Sogou's relative/redirect form to an absolute target URL via
    ``_normalize_href``.
    """
    kept: list[TextResult] = []
    for result in results:
        # A result without both a link and a title is unusable; skip it.
        if not (result.href and result.title):
            continue
        result.href = self._normalize_href(result.href)
        kept.append(result)
    return kept

def _normalize_href(self, href: str) -> str:
    """Return an absolute URL for *href*, following Sogou's redirect wrapper.

    Direct links are simply made absolute against ``search_url``. Links
    routed through ``sogou.com/link?url=`` are fetched once and the real
    target is extracted from the interstitial page; the outcome (including
    a failed resolution, which falls back to the wrapper URL itself) is
    memoized in ``self._href_cache`` for the lifetime of this instance.
    """
    absolute = urljoin(self.search_url, href)
    if "sogou.com/link?url=" not in absolute:
        return absolute

    cached = self._href_cache.get(absolute)
    if cached is not None:
        return cached

    target = absolute  # fallback when the redirect cannot be resolved
    try:
        response = self.http_client.request("GET", absolute)
    except Exception as exc:  # noqa: BLE001
        logger.debug("Error resolving Sogou link %s: %r", absolute, exc)
    else:
        if response.status_code == 200 and response.text:
            found = self._redirect_pattern.search(response.text) or self._meta_refresh_pattern.search(response.text)
            if found:
                target = found.group("url")
    self._href_cache[absolute] = target
    return target