-
Notifications
You must be signed in to change notification settings - Fork 435
feat: Update RecoverableState
and StorageInstanceManager
to ensure proper persistence
#1368
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,6 +90,7 @@ def __init__( | |
metadata: RequestQueueMetadata, | ||
storage_dir: Path, | ||
lock: asyncio.Lock, | ||
recoverable_state: RecoverableState[RequestQueueState], | ||
) -> None: | ||
"""Initialize a new instance. | ||
|
||
|
@@ -112,13 +113,7 @@ def __init__( | |
self._is_empty_cache: bool | None = None | ||
"""Cache for is_empty result: None means unknown, True/False is cached state.""" | ||
|
||
self._state = RecoverableState[RequestQueueState]( | ||
default_state=RequestQueueState(), | ||
persist_state_key='request_queue_state', | ||
persistence_enabled=True, | ||
persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}', | ||
logger=logger, | ||
) | ||
self._state = recoverable_state | ||
"""Recoverable state to maintain request ordering, in-progress status, and handled status.""" | ||
|
||
@override | ||
|
@@ -187,14 +182,9 @@ async def open( | |
metadata = RequestQueueMetadata(**file_content) | ||
|
||
if metadata.id == id: | ||
client = cls( | ||
metadata=metadata, | ||
storage_dir=storage_dir, | ||
lock=asyncio.Lock(), | ||
client = await cls._create_client( | ||
metadata=metadata, storage_dir=storage_dir, update_accessed_at=True | ||
) | ||
await client._state.initialize() | ||
await client._discover_existing_requests() | ||
await client._update_metadata(update_accessed_at=True) | ||
found = True | ||
break | ||
finally: | ||
|
@@ -224,15 +214,7 @@ async def open( | |
|
||
metadata.name = name | ||
|
||
client = cls( | ||
metadata=metadata, | ||
storage_dir=storage_dir, | ||
lock=asyncio.Lock(), | ||
) | ||
|
||
await client._state.initialize() | ||
await client._discover_existing_requests() | ||
await client._update_metadata(update_accessed_at=True) | ||
client = await cls._create_client(metadata=metadata, storage_dir=storage_dir, update_accessed_at=True) | ||
|
||
# Otherwise, create a new dataset client. | ||
else: | ||
|
@@ -248,13 +230,40 @@ async def open( | |
pending_request_count=0, | ||
total_request_count=0, | ||
) | ||
client = cls( | ||
metadata=metadata, | ||
storage_dir=storage_dir, | ||
lock=asyncio.Lock(), | ||
) | ||
await client._state.initialize() | ||
await client._update_metadata() | ||
client = await cls._create_client(metadata=metadata, storage_dir=storage_dir) | ||
|
||
return client | ||
|
||
@classmethod | ||
async def _create_client( | ||
cls, metadata: RequestQueueMetadata, storage_dir: Path, *, update_accessed_at: bool = False | ||
) -> FileSystemRequestQueueClient: | ||
"""Create client from metadata and storage directory.""" | ||
from crawlee.storage_clients import FileSystemStorageClient # noqa: PLC0415 avoid circular imports | ||
from crawlee.storages._key_value_store import KeyValueStore # noqa: PLC0415 avoid circular imports | ||
|
||
# Prepare kvs for recoverable state | ||
kvs_client = await FileSystemStorageClient().create_kvs_client(name=f'__RQ_STATE_{metadata.id}') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wait a minute. This is a subtle bug waiting to happen - the Can you pass the actual storage client down to this method instead? |
||
kvs_client_metadata = await kvs_client.get_metadata() | ||
kvs = KeyValueStore(client=kvs_client, id=kvs_client_metadata.id, name=kvs_client_metadata.name) | ||
|
||
# Create state | ||
recoverable_state = RecoverableState[RequestQueueState]( | ||
default_state=RequestQueueState(), | ||
persist_state_key='request_queue_state', | ||
persistence_enabled=True, | ||
logger=logger, | ||
key_value_store=kvs, | ||
) | ||
|
||
# Create client | ||
client = cls( | ||
metadata=metadata, storage_dir=storage_dir, lock=asyncio.Lock(), recoverable_state=recoverable_state | ||
) | ||
|
||
await client._state.initialize() | ||
await client._discover_existing_requests() | ||
await client._update_metadata(update_accessed_at=update_accessed_at) | ||
|
||
return client | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,17 @@ | ||
from __future__ import annotations | ||
|
||
from collections import defaultdict | ||
from collections.abc import Awaitable, Callable | ||
from dataclasses import dataclass, field | ||
from typing import TYPE_CHECKING, TypeVar, cast | ||
|
||
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient | ||
|
||
from ._base import Storage | ||
|
||
if TYPE_CHECKING: | ||
from crawlee.configuration import Configuration | ||
|
||
from ._base import Storage | ||
|
||
T = TypeVar('T', bound='Storage') | ||
|
||
StorageClientType = DatasetClient | KeyValueStoreClient | RequestQueueClient | ||
|
@@ -19,6 +21,22 @@ | |
"""Type alias for the client opener function.""" | ||
|
||
|
||
@dataclass | ||
class _StorageClientCache: | ||
"""Cache for specific storage client.""" | ||
|
||
by_id: defaultdict[type[Storage], defaultdict[str, Storage]] = field( | ||
default_factory=lambda: defaultdict(lambda: defaultdict()) | ||
) | ||
"""Cache for storage instances by ID, separated by storage type.""" | ||
by_name: defaultdict[type[Storage], defaultdict[str, Storage]] = field( | ||
default_factory=lambda: defaultdict(lambda: defaultdict()) | ||
) | ||
"""Cache for storage instances by name, separated by storage type.""" | ||
default_instances: defaultdict[type[Storage], Storage] = field(default_factory=lambda: defaultdict()) | ||
"""Cache for default instances of each storage type.""" | ||
|
||
|
||
class StorageInstanceManager: | ||
"""Manager for caching and managing storage instances. | ||
|
||
|
@@ -27,14 +45,7 @@ class StorageInstanceManager: | |
""" | ||
|
||
def __init__(self) -> None: | ||
self._cache_by_id = dict[type[Storage], dict[str, Storage]]() | ||
"""Cache for storage instances by ID, separated by storage type.""" | ||
|
||
self._cache_by_name = dict[type[Storage], dict[str, Storage]]() | ||
"""Cache for storage instances by name, separated by storage type.""" | ||
|
||
self._default_instances = dict[type[Storage], Storage]() | ||
"""Cache for default instances of each storage type.""" | ||
self._cache_by_storage_client: dict[str, _StorageClientCache] = defaultdict(_StorageClientCache) | ||
|
||
async def open_storage_instance( | ||
self, | ||
|
@@ -64,19 +75,23 @@ async def open_storage_instance( | |
raise ValueError('Only one of "id" or "name" can be specified, not both.') | ||
|
||
# Check for default instance | ||
if id is None and name is None and cls in self._default_instances: | ||
return cast('T', self._default_instances[cls]) | ||
if ( | ||
id is None | ||
and name is None | ||
and cls in self._cache_by_storage_client[client_opener.__qualname__].default_instances | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's no guarantee that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, with the current state, we do not have access to the storage_client before caching, so I was left with client_opener. I was thinking about accessing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess that that's another consideration related to the instance cache mechanism. We should resolve that before v1. |
||
): | ||
return cast('T', self._cache_by_storage_client[client_opener.__qualname__].default_instances[cls]) | ||
|
||
# Check cache | ||
if id is not None: | ||
type_cache_by_id = self._cache_by_id.get(cls, {}) | ||
type_cache_by_id = self._cache_by_storage_client[client_opener.__qualname__].by_id[cls] | ||
if id in type_cache_by_id: | ||
cached_instance = type_cache_by_id[id] | ||
if isinstance(cached_instance, cls): | ||
return cached_instance | ||
|
||
if name is not None: | ||
type_cache_by_name = self._cache_by_name.get(cls, {}) | ||
type_cache_by_name = self._cache_by_storage_client[client_opener.__qualname__].by_name[cls] | ||
if name in type_cache_by_name: | ||
cached_instance = type_cache_by_name[name] | ||
if isinstance(cached_instance, cls): | ||
|
@@ -90,16 +105,13 @@ async def open_storage_instance( | |
instance_name = getattr(instance, 'name', None) | ||
|
||
# Cache the instance | ||
type_cache_by_id = self._cache_by_id.setdefault(cls, {}) | ||
type_cache_by_name = self._cache_by_name.setdefault(cls, {}) | ||
|
||
type_cache_by_id[instance.id] = instance | ||
self._cache_by_storage_client[client_opener.__qualname__].by_id[cls][instance.id] = instance | ||
if instance_name is not None: | ||
type_cache_by_name[instance_name] = instance | ||
self._cache_by_storage_client[client_opener.__qualname__].by_name[cls][instance_name] = instance | ||
|
||
# Set as default if no id/name specified | ||
if id is None and name is None: | ||
self._default_instances[cls] = instance | ||
self._cache_by_storage_client[client_opener.__qualname__].default_instances[cls] = instance | ||
|
||
return instance | ||
|
||
|
@@ -112,22 +124,23 @@ def remove_from_cache(self, storage_instance: Storage) -> None: | |
storage_type = type(storage_instance) | ||
|
||
# Remove from ID cache | ||
type_cache_by_id = self._cache_by_id.get(storage_type, {}) | ||
if storage_instance.id in type_cache_by_id: | ||
del type_cache_by_id[storage_instance.id] | ||
|
||
# Remove from name cache | ||
if storage_instance.name is not None: | ||
type_cache_by_name = self._cache_by_name.get(storage_type, {}) | ||
if storage_instance.name in type_cache_by_name: | ||
for client_cache in self._cache_by_storage_client.values(): | ||
type_cache_by_id = client_cache.by_id[storage_type] | ||
if storage_instance.id in type_cache_by_id: | ||
del type_cache_by_id[storage_instance.id] | ||
|
||
# Remove from name cache | ||
type_cache_by_name = client_cache.by_name[storage_type] | ||
if storage_instance.name in type_cache_by_name and storage_instance.name: | ||
del type_cache_by_name[storage_instance.name] | ||
|
||
# Remove from default instances | ||
if storage_type in self._default_instances and self._default_instances[storage_type] is storage_instance: | ||
del self._default_instances[storage_type] | ||
# Remove from default instances | ||
if ( | ||
storage_type in client_cache.default_instances | ||
and client_cache.default_instances[storage_type] is storage_instance | ||
): | ||
del client_cache.default_instances[storage_type] | ||
|
||
def clear_cache(self) -> None: | ||
"""Clear all cached storage instances.""" | ||
self._cache_by_id.clear() | ||
self._cache_by_name.clear() | ||
self._default_instances.clear() | ||
self._cache_by_storage_client = defaultdict(_StorageClientCache) |
Uh oh!
There was an error while loading. Please reload this page.