Skip to content

Commit

Permalink
Merge pull request #40 from philbudne/fix-sort-params
Browse files Browse the repository at this point in the history
Fix MediaCloudESProvider to accept NSA sort parameters
  • Loading branch information
pgulley authored Jan 16, 2025
2 parents 1420071 + 5b6f7d7 commit 0623c1a
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 17 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Make sure `pip install flit twine` so you can build and deploy to PyPI.
4. A github action will build and push the repository on committing a tagged version

### Version History
* __v3.0.1__ - Fix ES Provider to accept sort_{order,field} paging arguments like NSA-based Provider
* __v3.0.0__ - New "OnlineNewsMediaCloudProvider" using Elasticsearch DSL for direct access to the ES cluster. Retain old provider as "OnlineNewsMediaCloudOldProvider" for now.
* __v2.2.0__ - Added an optional argument to providers to toggle caching behavior, added more specific error on 504
* __v2.1.1__ - Bugfix
Expand Down
30 changes: 15 additions & 15 deletions mc_providers/onlinenews.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,12 +659,12 @@ def _assemble_and_chunk_query_str(cls, base_query: str, chunk: bool = True, **kw
# Was publication_date, but web-search always passes indexed_date.
# identical indexed_date values (without fractional seconds?!) have
# been seen in the wild (entire day 2024-01-10).
_DEF_PAGE_SORT_FIELD = "indexed_date"
_DEF_PAGE_SORT_ORDER = "desc"
_DEF_SORT_FIELD = "indexed_date"
_DEF_SORT_ORDER = "desc"

# Secondary sort key to break ties if page_sort_field is non-empty.
# Used as sole sort key if "page_sort_field" argument or
# _DEF_PAGE_SORT_FIELD (above) is empty string.
# Secondary sort key to break ties if sort_field is non-empty.
# Used as sole sort key if "sort_field" argument or
# _DEF_SORT_FIELD (above) is empty string.
# _doc is documented as most efficient sort key at:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/sort-search-results.html
#
Expand All @@ -677,7 +677,7 @@ def _assemble_and_chunk_query_str(cls, base_query: str, chunk: bool = True, **kw
#
# HOWEVER: use of session_id/preference should route all requests
# from the same session to the same shards for each successive query.
_SECONDARY_PAGE_SORT_ARGS = {"_doc": "asc"}
_SECONDARY_SORT_ARGS = {"_doc": "asc"}

def _sanitize(s: str) -> str:
"""
Expand Down Expand Up @@ -1120,21 +1120,21 @@ def paged_items(

page_size = min(page_size, _ES_MAXPAGE)
expanded = kwargs.pop("expanded", False)
page_sort_field = kwargs.pop("page_sort_field", _DEF_PAGE_SORT_FIELD)
page_sort_order = kwargs.pop("page_sort_order", _DEF_PAGE_SORT_ORDER)
sort_field = kwargs.pop("sort_field", _DEF_SORT_FIELD)
sort_order = kwargs.pop("sort_order", _DEF_SORT_ORDER)
pagination_token = kwargs.pop("pagination_token", None)

# NOTE! depends on client limiting to reasonable choices!!
# (full text might leak data, or causes memory exhaustion!)
if page_sort_field not in self._fields(expanded):
raise ValueError(page_sort_field)
if page_sort_order not in ["asc", "desc"]:
raise ValueError(page_sort_order)
if sort_field not in self._fields(expanded):
raise ValueError(sort_field)
if sort_order not in ["asc", "desc"]:
raise ValueError(sort_order)

# see discussion above at _SECONDARY_PAGE_SORT_ARGS declaration
# see discussion above at _SECONDARY_SORT_ARGS declaration
sort_opts = [
{page_sort_field: page_sort_order},
_SECONDARY_PAGE_SORT_ARGS
{sort_field: sort_order},
_SECONDARY_SORT_ARGS
]

search = self._basic_search(query, start_date, end_date, expanded=expanded, **kwargs)\
Expand Down
44 changes: 43 additions & 1 deletion mc_providers/test/test_onlinenews.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,13 +347,17 @@ def test_paged_items(self):
query = "biden"
start_date = dt.datetime(2020, 1, 1)
end_date = dt.datetime(2023, 12, 1)
out_field = "indexed_date" # output name of sort field
story_count = self._provider.count(query, start_date, end_date)#, chunk=False)
# make sure test case is reasonable size (ie. more than one page, but not too many pages
assert story_count > 1000
# fetch first page
page1, next_token1 = self._provider.paged_items(query, start_date, end_date)#, chunk=False)
assert len(page1) > 0
assert next_token1 is not None
page1_first = page1[0][out_field]
page1_last = page1[-1][out_field]
assert page1_first > page1_last
page1_url1 = page1[0]['url']
# grab token, fetch next page
page2, next_token2 = self._provider.paged_items(query, start_date, end_date,# chunk=False,
Expand All @@ -363,6 +367,44 @@ def test_paged_items(self):
assert next_token1 != next_token2 # verify paging token changed
page2_urls = [s['url'] for s in page2]
assert page1_url1 not in page2_urls # verify pages don't overlap
page2_first = page2[0][out_field]
page2_last = page2[-1][out_field]
assert page2_first < page1_last
assert page2_first > page2_last

def test_paged_items_asc(self):
"""
check that sort_order parameter accepted and honored
"""
query = "biden"
start_date = dt.datetime(2020, 1, 1)
end_date = dt.datetime(2020, 1, 2)
out_field = "indexed_date" # output name of sort field
story_count = self._provider.count(query, start_date, end_date, chunk=False)
# make sure test case is reasonable size (ie. more than one page, but not too many pages
assert story_count > 1000
# fetch first page
page1, next_token1 = self._provider.paged_items(query, start_date, end_date, chunk=False,
sort_order="asc")
assert len(page1) > 0
assert next_token1 is not None
page1_first = page1[0][out_field]
page1_last = page1[-1][out_field]
assert page1_first < page1_last
page1_url1 = page1[0]['url']
# grab token, fetch next page
page2, next_token2 = self._provider.paged_items(query, start_date, end_date, chunk=False,
pagination_token=next_token1,
sort_order="asc")
assert len(page2) > 0
assert next_token2 is not None
assert next_token1 != next_token2 # verify paging token changed
page2_urls = [s['url'] for s in page2]
assert page1_url1 not in page2_urls # verify pages don't overlap
page2_first = page2[0][out_field]
page2_last = page2[-1][out_field]
assert page2_first > page1_last
assert page2_first < page2_last

def _test_domain_filters(self, domains: List[str]):
query = "biden"
Expand Down Expand Up @@ -461,4 +503,4 @@ def test_item(self):
story = self._provider.item(STORY_ID)
assert len(story['title']) > 0
assert story['language'] == 'en'
assert story['media_name'] == 'cnn.com'
assert story['media_name'] == 'cnn.com'
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

[project]
name = "mc-providers"
version = "3.0.0"
version = "3.0.1"
authors = [
{name = "Paige Gulley", email = "[email protected]"},
{name = "Rahul Bhargava", email = "[email protected]"},
Expand Down

0 comments on commit 0623c1a

Please sign in to comment.