diff --git a/README.md b/README.md
index 3172ace..7f69429 100755
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ ddgs --help
 
 | DDGS function | Available backends |
 | --------------|:-------------------|
-| text()        | `bing`, `brave`, `duckduckgo`, `google`, `grokipedia`, `mojeek`, `yandex`, `yahoo`, `wikipedia`|
+| text()        | `bing`, `brave`, `duckduckgo`, `google`, `grokipedia`, `mojeek`, `sogou`, `yandex`, `yahoo`, `wikipedia`|
 | images()      | `duckduckgo` |
 | videos()      | `duckduckgo` |
 | news()        | `bing`, `duckduckgo`, `yahoo` |
diff --git a/ddgs/cli.py b/ddgs/cli.py
index 0d5aa86..7e2b5d1 100644
--- a/ddgs/cli.py
+++ b/ddgs/cli.py
@@ -191,6 +191,7 @@ def version() -> str:
             "google",
             "grokipedia",
             "mojeek",
+            "sogou",
             "yandex",
             "yahoo",
             "wikipedia",
diff --git a/ddgs/engines/sogou.py b/ddgs/engines/sogou.py
new file mode 100644
index 0000000..f52479c
--- /dev/null
+++ b/ddgs/engines/sogou.py
@@ -0,0 +1,131 @@
+"""Sogou search engine implementation."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar
+from urllib.parse import urljoin
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+from ddgs.base import BaseSearchEngine
+from ddgs.results import TextResult
+
+
+class Sogou(BaseSearchEngine[TextResult]):
+    """Sogou text search engine.
+
+    Hrefs containing ``/link?url=`` are treated as redirect wrappers rather than
+    target pages. When a result block also carries an absolute ``data-url`` attribute,
+    that value is used as the link; results still left with a wrapper link are
+    discarded by ``post_extract_results``.
+    """
+
+    name = "sogou"
+    category = "text"
+    provider = "sogou"
+
+    search_url = "https://www.sogou.com/web"
+    search_method = "GET"
+
+    items_xpath = "//div[contains(@class, 'vrwrap') and not(contains(@class, 'hint'))]"
+    elements_xpath: ClassVar[Mapping[str, str]] = {
+        "title": ".//h3//a//text()",
+        "href": ".//h3//a/@href",
+        "body": ".//div[contains(@class, 'space-txt')]//text()",
+    }
+
+    # Matches every ``data-url`` attribute inside a result block; the first non-blank one is used.
+    _data_url_xpath = ".//*[@data-url]/@data-url"
+
+    def build_payload(
+        self,
+        query: str,
+        region: str,  # noqa: ARG002
+        safesearch: str,  # noqa: ARG002
+        timelimit: str | None,
+        page: int = 1,
+        **kwargs: str,  # noqa: ARG002
+    ) -> dict[str, Any]:
+        """Build the query parameters for the search request.
+
+        Args:
+            query: Search query string.
+            region: Not used by this engine.
+            safesearch: Not used by this engine.
+            timelimit: One of "d", "w", "m", "y", or a falsy value for no time filter.
+            page: Page number; sent as the ``page`` parameter only when greater than 1.
+            **kwargs: Not used by this engine.
+
+        Returns:
+            Query parameter names mapped to string values.
+
+        Raises:
+            KeyError: If ``timelimit`` is truthy but not one of the supported codes.
+        """
+        # NOTE(review): the fixed "ie", "p" and "dp" values are sent as-is; their meaning
+        # is not visible in this module - confirm against Sogou before changing them.
+        payload = {"query": query, "ie": "utf8", "p": "40040100", "dp": "1"}
+        if timelimit:
+            payload["tsn"] = {"d": "1", "w": "7", "m": "30", "y": "365"}[timelimit]
+        if page > 1:
+            payload["page"] = str(page)
+        return payload
+
+    @staticmethod
+    def _xpath_join(item: Any, xpath: str) -> str:  # noqa: ANN401
+        """Return all non-blank xpath matches, stripped and joined with single spaces."""
+        return " ".join(x.strip() for x in item.xpath(xpath) if x and x.strip())
+
+    @staticmethod
+    def _xpath_first(item: Any, xpath: str) -> str:  # noqa: ANN401
+        """Return the first non-blank xpath match, stripped, or an empty string if none."""
+        for value in item.xpath(xpath):
+            if value and (str_value := value.strip()):
+                return str_value
+        return ""
+
+    @staticmethod
+    def _is_wrapper_link(href: str) -> bool:
+        """Return True if ``href`` is a Sogou redirect link rather than a target URL."""
+        # Covers both the relative form and the absolute "sogou.com/link?url=" form,
+        # since the latter always contains this substring.
+        return "/link?url=" in href
+
+    def extract_results(self, html_text: str) -> list[TextResult]:
+        """Extract search results from html text.
+
+        A wrapper link is replaced by the block's ``data-url`` value when that value is an
+        absolute http(s) URL; otherwise the original href is kept for later filtering.
+        """
+        html_text = self.pre_process_html(html_text)
+        tree = self.extract_tree(html_text)
+        items = tree.xpath(self.items_xpath)
+        results: list[TextResult] = []
+        for item in items:
+            title = self._xpath_join(item, self.elements_xpath["title"])
+            href = self._xpath_first(item, self.elements_xpath["href"])
+            body = self._xpath_join(item, self.elements_xpath["body"])
+            data_url = self._xpath_first(item, self._data_url_xpath)
+            if href and self._is_wrapper_link(href) and data_url.startswith(("http://", "https://")):
+                href = data_url
+            results.append(TextResult(title=title, href=href, body=body))
+        return results
+
+    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
+        """Post-process search results.
+
+        Drops results without a title or href, resolves relative hrefs against
+        ``search_url``, and drops results whose link is still a redirect wrapper.
+        """
+        post_results: list[TextResult] = []
+        for result in results:
+            if not (result.href and result.title):
+                continue
+
+            href = urljoin(self.search_url, result.href)
+            if self._is_wrapper_link(href):
+                continue
+
+            post_results.append(TextResult(title=result.title, href=href, body=result.body))
+        return post_results
diff --git a/tests/sogou_test.py b/tests/sogou_test.py
new file mode 100644
index 0000000..5d3b892
--- /dev/null
+++ b/tests/sogou_test.py
@@ -0,0 +1,44 @@
+from
ddgs.engines.sogou import Sogou +from ddgs.http_client import Response + + +def test_sogou_uses_data_url_to_avoid_extra_requests() -> None: + engine = Sogou() + + def fail_request(*_args: object, **_kwargs: object) -> Response: # noqa: ARG001 + raise AssertionError("Unexpected request while resolving wrapper links") + + engine.http_client.request = fail_request # type: ignore[method-assign] + html_text = """ + +
+

Example title

+
Example body
+
+
+ + """ + + results = engine.post_extract_results(engine.extract_results(html_text)) + assert len(results) == 1 + assert results[0].href == "https://example.com/target" + + +def test_sogou_resolves_wrapper_link_when_data_url_missing() -> None: + engine = Sogou() + + def fake_request(*args: object, **kwargs: object) -> Response: + raise AssertionError("Unexpected request while resolving wrapper links") + + engine.http_client.request = fake_request # type: ignore[method-assign] + html_text = """ + +
+

Example title

+
Example body
+
+ + """ + + results = engine.post_extract_results(engine.extract_results(html_text)) + assert results == []