From 5b6f7d7ca5afd3b5c8cfe3405bdd21f6b6aefdc2 Mon Sep 17 00:00:00 2001 From: Phil Budne Date: Wed, 15 Jan 2025 18:06:05 -0500 Subject: [PATCH] Fix MediaCloudESProvider to accept NSA sort parameters --- README.md | 1 + mc_providers/onlinenews.py | 30 +++++++++---------- mc_providers/test/test_onlinenews.py | 44 +++++++++++++++++++++++++++- pyproject.toml | 2 +- 4 files changed, 60 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index c2320c3..0281f21 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Make sure `pip install flit twine` so you can build and deploy to PyPI. 4. A github action will build and push the repository on committing a tagged version ### Version History +* __v3.0.1__ - Fix ES Provider to accept sort_{order,field} paging arguments like NSA-based Provider * __v3.0.0__ - New "OnlineNewsMediaCloudProvider" using Elasticsearch DSL for direct access to the ES cluster. Retain old provider as "OnlineNewsMediaCloudOldProvider" for now. * __v2.2.0__ - Added an optional argument to providers to toggle caching behavior, added more specific error on 504 * __v2.1.1__ - Bugfix diff --git a/mc_providers/onlinenews.py b/mc_providers/onlinenews.py index 9f1fbbd..6d3af6c 100644 --- a/mc_providers/onlinenews.py +++ b/mc_providers/onlinenews.py @@ -659,12 +659,12 @@ def _assemble_and_chunk_query_str(cls, base_query: str, chunk: bool = True, **kw # Was publication_date, but web-search always passes indexed_date. # identical indexed_date values (without fractional seconds?!) have # been seen in the wild (entire day 2024-01-10). -_DEF_PAGE_SORT_FIELD = "indexed_date" -_DEF_PAGE_SORT_ORDER = "desc" +_DEF_SORT_FIELD = "indexed_date" +_DEF_SORT_ORDER = "desc" -# Secondary sort key to break ties if page_sort_field is non-empty. -# Used as sole sort key if "page_sort_field" argument or -# _DEF_PAGE_SORT_FIELD (above) is empty string. +# Secondary sort key to break ties if sort_field is non-empty. +# Used as sole sort key if "sort_field" argument or +# _DEF_SORT_FIELD (above) is empty string. # _doc is documented as most efficient sort key at: # https://www.elastic.co/guide/en/elasticsearch/reference/current/sort-search-results.html # @@ -677,7 +677,7 @@ def _assemble_and_chunk_query_str(cls, base_query: str, chunk: bool = True, **kw # # HOWEVER: use of session_id/preference should route all requests # from the same session to the same shards for each successive query. -_SECONDARY_PAGE_SORT_ARGS = {"_doc": "asc"} +_SECONDARY_SORT_ARGS = {"_doc": "asc"} def _sanitize(s: str) -> str: """ @@ -1120,21 +1120,21 @@ def paged_items( page_size = min(page_size, _ES_MAXPAGE) expanded = kwargs.pop("expanded", False) - page_sort_field = kwargs.pop("page_sort_field", _DEF_PAGE_SORT_FIELD) - page_sort_order = kwargs.pop("page_sort_order", _DEF_PAGE_SORT_ORDER) + sort_field = kwargs.pop("sort_field", _DEF_SORT_FIELD) + sort_order = kwargs.pop("sort_order", _DEF_SORT_ORDER) pagination_token = kwargs.pop("pagination_token", None) # NOTE! depends on client limiting to reasonable choices!! # (full text might leak data, or causes memory exhaustion!) - if page_sort_field not in self._fields(expanded): - raise ValueError(page_sort_field) - if page_sort_order not in ["asc", "desc"]: - raise ValueError(page_sort_order) + if sort_field not in self._fields(expanded): + raise ValueError(sort_field) + if sort_order not in ["asc", "desc"]: + raise ValueError(sort_order) - # see discussion above at _SECONDARY_PAGE_SORT_ARGS declaration + # see discussion above at _SECONDARY_SORT_ARGS declaration sort_opts = [ - {page_sort_field: page_sort_order}, - _SECONDARY_PAGE_SORT_ARGS + {sort_field: sort_order}, + _SECONDARY_SORT_ARGS ] search = self._basic_search(query, start_date, end_date, expanded=expanded, **kwargs)\ diff --git a/mc_providers/test/test_onlinenews.py b/mc_providers/test/test_onlinenews.py index e5f8bb2..3aadb8f 100644 --- a/mc_providers/test/test_onlinenews.py +++ b/mc_providers/test/test_onlinenews.py @@ -347,6 +347,7 @@ def test_paged_items(self): query = "biden" start_date = dt.datetime(2020, 1, 1) end_date = dt.datetime(2023, 12, 1) + out_field = "indexed_date" # output name of sort field story_count = self._provider.count(query, start_date, end_date)#, chunk=False) # make sure test case is reasonable size (ie. more than one page, but not too many pages assert story_count > 1000 @@ -354,6 +355,9 @@ def test_paged_items(self): page1, next_token1 = self._provider.paged_items(query, start_date, end_date)#, chunk=False) assert len(page1) > 0 assert next_token1 is not None + page1_first = page1[0][out_field] + page1_last = page1[-1][out_field] + assert page1_first > page1_last page1_url1 = page1[0]['url'] # grab token, fetch next page page2, next_token2 = self._provider.paged_items(query, start_date, end_date,# chunk=False, @@ -363,6 +367,44 @@ def test_paged_items(self): assert next_token1 != next_token2 # verify paging token changed page2_urls = [s['url'] for s in page2] assert page1_url1 not in page2_urls # verify pages don't overlap + page2_first = page2[0][out_field] + page2_last = page2[-1][out_field] + assert page2_first < page1_last + assert page2_first > page2_last + + def test_paged_items_asc(self): + """ + check that sort_order parameter accepted and honored + """ + query = "biden" + start_date = dt.datetime(2020, 1, 1) + end_date = dt.datetime(2020, 1, 2) + out_field = "indexed_date" # output name of sort field + story_count = self._provider.count(query, start_date, end_date, chunk=False) + # make sure test case is reasonable size (ie. more than one page, but not too many pages + assert story_count > 1000 + # fetch first page + page1, next_token1 = self._provider.paged_items(query, start_date, end_date, chunk=False, + sort_order="asc") + assert len(page1) > 0 + assert next_token1 is not None + page1_first = page1[0][out_field] + page1_last = page1[-1][out_field] + assert page1_first < page1_last + page1_url1 = page1[0]['url'] + # grab token, fetch next page + page2, next_token2 = self._provider.paged_items(query, start_date, end_date, chunk=False, + pagination_token=next_token1, + sort_order="asc") + assert len(page2) > 0 + assert next_token2 is not None + assert next_token1 != next_token2 # verify paging token changed + page2_urls = [s['url'] for s in page2] + assert page1_url1 not in page2_urls # verify pages don't overlap + page2_first = page2[0][out_field] + page2_last = page2[-1][out_field] + assert page2_first > page1_last + assert page2_first < page2_last def _test_domain_filters(self, domains: List[str]): query = "biden" @@ -461,4 +503,4 @@ def test_item(self): story = self._provider.item(STORY_ID) assert len(story['title']) > 0 assert story['language'] == 'en' - assert story['media_name'] == 'cnn.com' \ No newline at end of file + assert story['media_name'] == 'cnn.com' diff --git a/pyproject.toml b/pyproject.toml index 6ff72e7..8398b4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi" [project] name = "mc-providers" -version = "3.0.0" +version = "3.0.1" authors = [ {name = "Paige Gulley", email = "nano3.14@gmail.com"}, {name = "Rahul Bhargava", email = "r.bhargava@northeastern.edu"},