Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
## v0.9.29 (2026-01-14)

### Fix

- add timeout reset
- improve requests and cooldown
- correct playwright installation
- correct dockerignore and entrypoint

### Refactor

- add ruff formatting
- improve timeout logic
- improve linking procedure
- update dependencies
- start improving requests

## v0.9.28 (2026-01-12)

### Fix
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Strain Authentication and Identification Methods - saim

[![release: 0.9.28](https://img.shields.io/badge/rel-0.9.28-blue.svg?style=flat-square)](https://github.com/LeibnizDSMZ/saim.git)
[![release: 0.9.29](https://img.shields.io/badge/rel-0.9.29-blue.svg?style=flat-square)](https://github.com/LeibnizDSMZ/saim.git)
[![MIT LICENSE](https://img.shields.io/badge/License-MIT-brightgreen.svg?style=flat-square)](https://choosealicense.com/licenses/mit/)
[![Documentation Status](https://img.shields.io/badge/docs-GitHub-blue.svg?style=flat-square)](https://LeibnizDSMZ.github.io/saim/)

Expand Down
6 changes: 5 additions & 1 deletion lefthook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,14 @@ pre-commit:
group:
piped: true
jobs:
- name: ruff
- name: ruff-check
glob: "*.py"
run: ruff check {staged_files} --fix
stage_fixed: true
- name: ruff-format
glob: "*.py"
run: ruff format {staged_files}
stage_fixed: true
- name: uv-export
run: make runLock && git add \*/requirements\*\.txt
stage_fixed: true
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "saim"
version = "0.9.28"
version = "0.9.29"
description = """A library for identifying strains."""
readme = "README.md"
authors = [
Expand Down Expand Up @@ -28,7 +28,7 @@ dependencies = [

[project.scripts]
extract_ccno = "saim.designation.main:run"
verify_cafi = "saim.culture_link.main:run"
verify_links = "saim.culture_link.main:run"

[tool.setuptools.packages.find]
where = ["src"]
Expand Down Expand Up @@ -250,7 +250,7 @@ parallel = true

[tool.coverage.report]
show_missing = false
fail_under = 50
fail_under = 30

[tool.vulture]
paths = ["configs/dev/whitelist.py", "src"]
Expand Down
165 changes: 71 additions & 94 deletions src/saim/culture_link/private/cached_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,20 @@
Mapping,
ParamSpec,
final,
override,
)
import warnings
from requests import PreparedRequest, Timeout
from requests.structures import CaseInsensitiveDict
from requests.adapters import HTTPAdapter, BaseAdapter
from requests_cache import AnyResponse, BaseCache, CachedSession
from playwright.async_api import (Response, async_playwright, Error,
BrowserContext, Playwright, Page)
from requests.adapters import BaseAdapter
from requests_cache import BaseCache, CachedSession
from playwright.async_api import (
Response,
async_playwright,
Error,
BrowserContext,
Playwright,
Page,
)
from urllib3 import HTTPResponse
from requests.models import Response as RequestResponse
from requests.exceptions import RequestException
Expand All @@ -41,9 +46,7 @@ async def _get_resp(
resp = await call()
except Error as err:
warnings.warn(
f"{retry!s} - {err!s} - {resp!s} - {err_str}",
RequestWarn,
stacklevel=0
f"{retry!s} - {err!s} - {resp!s} - {err_str}", RequestWarn, stacklevel=0
)
return resp

Expand Down Expand Up @@ -136,18 +139,6 @@ def close(self, last: bool) -> None:
runner.close()


@final
class SimpleHTTPAdapter(HTTPAdapter):

@override
def close(self) -> None:
pass

def finish(self) -> None:
print("CLOSING HTTP")
super(HTTPAdapter, self).close()


BLOCK_TYPES: Final[list[str]] = [
"image",
"media",
Expand All @@ -163,19 +154,23 @@ class BrowserPWAdapter(BaseAdapter):
__slots__: tuple[str, ...] = (
"__browser",
"__contact",
"__cool_down",
"__delay",
"__pwc",
"__retries",
"__runner",
"__tmp"
"__tmp",
)

def __init__(
self, pwc: PWContext, contact: str = "", max_retries: int = 0, /
self, pwc: PWContext, contact: str = "", max_attempts: int = 1, /
) -> None:
self.__retries = max_retries if max_retries > 1 else 1
self.__pwc: PWContext = pwc
self.__contact = contact
self.__tmp = tempfile.TemporaryDirectory()
self.__cool_down: CoolDownDomain | None = None
self.__delay = 1.0
self.__retries = max_attempts if max_attempts > 1 else 1
if not self.__pwc.is_test:
ctx = self.__pwc.runner.run(self.__pwc.ctx)
self.__browser: BrowserContext | None = self.__pwc.runner.run(
Expand All @@ -192,6 +187,16 @@ def __init__(
self.__browser = None
super().__init__()

def set_cool_down(self, cool_down: CoolDownDomain, delay: float, /) -> None:
self.__cool_down = cool_down
self.__delay = delay

async def __await_cool_down(self) -> None:
if self.__cool_down is None:
await asyncio.sleep(1.0)
else:
self.__cool_down.await_cool_down(self.__delay)

async def __send(
self,
url: str,
Expand All @@ -211,18 +216,21 @@ async def __send(
"**/*",
lambda route, req: (
route.abort()
if req.resource_type in BLOCK_TYPES else route.continue_()
if req.resource_type in BLOCK_TYPES
else route.continue_()
),
)
page.on("console", lambda _: None)
await page.set_extra_http_headers(
{"User-Agent": get_user_agent(self.__contact)}
)
att_time = tout_msec * (0.5 if attempt > 0 else 1.0)

async def go_to_page(p: Page = page, t: float = att_time) -> Response | None:
return await p.goto(url, timeout=t, wait_until="load")

resp: Response | None = await _get_resp(go_to_page, err_str, attempt +1)
await self.__await_cool_down()
resp: Response | None = await _get_resp(go_to_page, err_str, attempt + 1)
if resp is not None:
start_time = time.time()
try:
Expand All @@ -239,7 +247,7 @@ async def go_to_page(p: Page = page, t: float = att_time) -> Response | None:
return _create_response(request, resp, content)
await page.close()
if attempt + 1 < self.__retries:
await asyncio.sleep(1.0 + (random.random() - 0.5)) # noqa: S311
await asyncio.sleep(1.0 + (random.random() - 0.5)) # noqa: S311
return None

def send(
Expand Down Expand Up @@ -272,18 +280,11 @@ def finish(self) -> None:
self.__tmp.cleanup()


def _mount_adapters(
adapter_pw: BrowserPWAdapter | SimpleHTTPAdapter, session: CachedSession, /
) -> None:
session.mount("http://", adapter_pw)
session.mount("https://", adapter_pw)


P = ParamSpec("P")


def create_get_cache(
adapter: BrowserPWAdapter | SimpleHTTPAdapter,
def _create_get_cache(
adapter: BrowserPWAdapter,
exp_days: int,
backend: BaseCache,
key_fn: Callable[Concatenate[PreparedRequest, P], str],
Expand All @@ -297,83 +298,59 @@ def create_get_cache(
stale_if_error=False,
always_revalidate=False,
allowable_codes=[*range(200, 400), 404, 403],
allowable_methods=(
"GET",
"HEAD",
),
allowable_methods=("GET",),
key_fn=key_fn,
)
_mount_adapters(adapter, session)
session.mount("http://", adapter)
session.mount("https://", adapter)
except Error as cex:
raise SessionCreationEx(f"{cex!s}") from cex
return session


def run_request(browser: bool, session: CachedSession, /) -> Callable[..., AnyResponse]:
if browser:
return session.get
return session.head


def _browser_fallback_wrap(
browser: bool,
pw_adapter: BrowserPWAdapter,
session: CachedSession,
url: str,
contact: str,
/,
) -> AnyResponse:
params = {
"timeout": 180,
"allow_redirects": True,
"headers": {"User-Agent": get_user_agent(contact)},
}
try:
response = run_request(browser, session)(url, **params)
except (Error, RequestException):
if not browser:
_mount_adapters(pw_adapter, session)
response = session.get(url, **params)
else:
raise
else:
if 400 <= response.status_code < 500 and not browser:
response = session.get(url, **params)
return response


def make_get_request(
browser: bool,
pw_adapter: BrowserPWAdapter,
url: str,
session: CachedSession,
domain_info: tuple[CoolDownDomain, RobotsTxt],
contact: str,
session: tuple[
BrowserPWAdapter,
int,
BaseCache,
Callable[Concatenate[PreparedRequest, P], str],
],
info: tuple[CoolDownDomain, RobotsTxt, str],
tasks_cnt: int,
/,
) -> CachedPageResp:
results = CachedPageResp(prohibited=True)
cool_down, robots_txt = domain_info
cool_down, robots_txt, contact = info
pw_adapter, exp, cache, call = session

pw_adapter.set_cool_down(cool_down, robots_txt.get_delay())
cached_session = _create_get_cache(
pw_adapter,
exp,
cache,
call,
)

def _callback(last_request: float, /) -> tuple[float, bool]:
if last_request < 0:
return last_request, True
nonlocal results
request_time = time.time()
if robots_txt.can_fetch(url):
if cool_down.skip_request():
return results
try:
response = _browser_fallback_wrap(browser, pw_adapter, session, url, contact)
if response.from_cache:
request_time = last_request
response = cached_session.get(
url,
**{
"timeout": 180,
"allow_redirects": True,
"headers": {"User-Agent": get_user_agent(contact)},
},
)
except (Error, RequestException):
results = CachedPageResp(timeout=True)
return request_time, True
cool_down.finished_request(True, tasks_cnt)
return CachedPageResp(timeout=True)
results = CachedPageResp(
response=b"" if response.content is None else response.content,
status=response.status_code,
cached=response.from_cache,
)
return request_time, False

if robots_txt.can_fetch(url):
delay = robots_txt.get_delay()
cool_down.call_after_cool_down(delay, _callback)
cool_down.finished_request(results.timeout, tasks_cnt)
return results
39 changes: 20 additions & 19 deletions src/saim/culture_link/private/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,31 +56,32 @@ class TaskPackage:
template_links: CatalogueLink
fallback_link: str = ""

@property
def urls(self) -> list[tuple[str, str, str, int]]:
return [
*[
(
LinkLevel.cat.value,
*self._pack_catalogue(cat),
)
for cat in self.template_links.catalogue
],
("fallback", *self._pack_catalogue(self.fallback_link)),
(
LinkLevel.home.value,
self.template_links.homepage,
str(CacheNames.hom.value),
HOME_EXP_DAYS,
),
]

def _pack_catalogue(self, link: str, /) -> tuple[str, str, int]:
if len(self.search_task.find_extra) == 0:
return (link, str(CacheNames.cat.value), CAT_EXP_DAYS)
return (link, str(CacheNames.cat_det.value), CAT_DET_EXP_DAYS)

def __iter__(self) -> Iterator[tuple[str, str, str, int]]:
return iter(
task
for task in [
*[
(
LinkLevel.cat.value,
*self._pack_catalogue(cat),
)
for cat in self.template_links.catalogue
],
("fallback", *self._pack_catalogue(self.fallback_link)),
(
LinkLevel.home.value,
self.template_links.homepage,
str(CacheNames.hom.value),
HOME_EXP_DAYS,
),
]
)
return iter(task for task in self.urls)


@final
Expand Down
Loading