ArraySubset comparison level #2416

Merged 12 commits on Sep 20, 2024
76 changes: 21 additions & 55 deletions .github/workflows/run_demos_examples.yml
@@ -8,76 +8,42 @@ on:
paths:
- "splink/**"
- "docs/demos/examples/**"
- !docs/demos/examples/examples_index.md
- "!docs/demos/examples/examples_index.md"
- "pyproject.toml"

workflow_dispatch:

jobs:
test-notebooks:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
test-group: ["duckdb", "spark", "sqlite"]
name: Test ${{ matrix.test-group }} notebooks with Python ${{ matrix.python-version }}
steps:
#----------------------------------------------
# check-out repo and set-up python
#----------------------------------------------
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
- uses: actions/checkout@v4

- name: Install poetry using pipx
run: pipx install poetry

- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
#----------------------------------------------
# -- save a few seconds by caching poetry --
#----------------------------------------------
- name: Load cached Poetry installation
uses: actions/cache@v2
with:
path: ~/.local # the path depends on the OS
key: poetry-1 # increment to reset cache
#----------------------------------------------
# ----- install & configure poetry -----
#----------------------------------------------
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: "1.7.0"
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true
#----------------------------------------------
# load cached venv if cache exists
#----------------------------------------------
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v2
with:
path: .venv
key: venv-demos-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-01
#----------------------------------------------
# install dependencies if cache does not exist
#----------------------------------------------
cache: "poetry"

- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction
#----------------------------------------------
# pip install additional requirements
#----------------------------------------------
- name: Install additional requirements from demos_requirements.txt
run: |
poetry run pip install -r docs/demos/data/demos_requirements.txt
#----------------------------------------------
# Modify the notebooks to reduce the amount of data processed and speed up the tests
#----------------------------------------------
poetry config virtualenvs.in-project true
poetry install --no-interaction --no-root
poetry run pip install -e .

- name: Install additional requirements from demos_requirements.txt
run: poetry run pip install -r docs/demos/data/demos_requirements.txt

- name: Modify Notebooks to reduce data size
run: |
python scripts/reduce_notebook_runtime.py
#----------------------------------------------
# Test each notebook group in parallel
#----------------------------------------------
run: python scripts/reduce_notebook_runtime.py

- name: Test ${{ matrix.test-group }} example notebooks with pytest
run: |
source .venv/bin/activate
python -m pytest -vv --nbmake -n=auto --nbmake-kernel=python3 --durations=0 docs/demos/examples/${{ matrix.test-group }}/*ipynb
poetry run pytest -vv --nbmake -n=auto --nbmake-kernel=python3 --durations=0 docs/demos/examples/${{ matrix.test-group }}/*ipynb
76 changes: 21 additions & 55 deletions .github/workflows/run_demos_tutorials.yml
@@ -13,70 +13,36 @@ on:
workflow_dispatch:

jobs:
build:
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
name: Run tutorial notebooks with Python ${{ matrix.python-version }}
steps:
#----------------------------------------------
# check-out repo and set-up python
#----------------------------------------------
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
- uses: actions/checkout@v4

- name: Install poetry using pipx
run: pipx install poetry

- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
#----------------------------------------------
# -- save a few seconds by caching poetry --
#----------------------------------------------
- name: Load cached Poetry installation
uses: actions/cache@v2
with:
path: ~/.local # the path depends on the OS
key: poetry-1 # increment to reset cache
#----------------------------------------------
# ----- install & configure poetry -----
#----------------------------------------------
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: "1.7.0"
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true
cache: "poetry"

#----------------------------------------------
# load cached venv if cache exists
#----------------------------------------------
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v2
with:
path: .venv
key: venv-demos-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-01
#----------------------------------------------
# install dependencies if cache does not exist
#----------------------------------------------
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction
#----------------------------------------------
# pip install additional requirements
#----------------------------------------------
- name: Install additional requirements from demos_requirements.txt
run: |
poetry run pip install -r docs/demos/data/demos_requirements.txt
#----------------------------------------------
# Modify the notebooks to reduce the amount of data processed and speed up the tests
#----------------------------------------------
poetry config virtualenvs.in-project true
poetry install --no-interaction --no-root
poetry run pip install -e .

- name: Install additional requirements
run: poetry run pip install -r docs/demos/data/demos_requirements.txt

- name: Modify Notebooks to reduce data size
run: python scripts/reduce_notebook_runtime.py

- name: Run tutorial notebooks
run: |
python scripts/reduce_notebook_runtime.py
#----------------------------------------------
# Make sure that the demo notebooks run without error
#----------------------------------------------
- name: Test with pytest
run: |
source .venv/bin/activate
python -m pytest -vv --nbmake -n=auto --nbmake-kernel=python3 docs/demos/tutorials/*ipynb
poetry run pytest -vv --nbmake -n=auto --nbmake-kernel=python3 docs/demos/tutorials/*ipynb
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Match weight and m and u probabilities charts now have improved tooltips ([#2392](https://github.com/moj-analytical-services/splink/pull/2392))
- Added new `AbsoluteDifferenceLevel` comparison level for numerical columns ([#2398](https://github.com/moj-analytical-services/splink/pull/2398))
- Added new `CosineSimilarityLevel` and `CosineSimilarityAtThresholds` for comparing array columns using cosine similarity ([#2405](https://github.com/moj-analytical-services/splink/pull/2405))
- Added new `ArraySubsetLevel` for comparing array columns ([#2416](https://github.com/moj-analytical-services/splink/pull/2416))

### Fixed

2 changes: 2 additions & 0 deletions splink/comparison_level_library.py
@@ -4,6 +4,7 @@
AbsoluteTimeDifferenceLevel,
And,
ArrayIntersectLevel,
ArraySubsetLevel,
ColumnsReversedLevel,
CosineSimilarityLevel,
CustomLevel,
@@ -40,6 +41,7 @@
"AbsoluteDateDifferenceLevel",
"DistanceInKMLevel",
"ArrayIntersectLevel",
"ArraySubsetLevel",
"PercentageDifferenceLevel",
"AbsoluteDifferenceLevel",
"And",
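A minimal usage sketch (not part of the PR diff): constructing the newly re-exported level from the public `splink.comparison_level_library` module, using the constructor signature added in `splink/internals/comparison_level_library.py` below. The column name `postcode_arr` is hypothetical.

```python
# Illustrative sketch only: assumes an array-valued column named "postcode_arr".
import splink.comparison_level_library as cll

# Default behaviour: empty arrays are never treated as subsets.
strict_subset = cll.ArraySubsetLevel("postcode_arr")

# Lenient variant: an empty array counts as a subset of any array.
lenient_subset = cll.ArraySubsetLevel("postcode_arr", empty_is_subset=True)
```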
46 changes: 46 additions & 0 deletions splink/internals/comparison_level_library.py
@@ -873,6 +873,52 @@ def create_label_for_charts(self) -> str:
return f"Array intersection size >= {self.min_intersection}"


class ArraySubsetLevel(ComparisonLevelCreator):
def __init__(self, col_name: str | ColumnExpression, empty_is_subset: bool = False):
"""Represents a comparison level where the smaller array is an
exact subset of the larger array. If arrays are equal length, they
must have the same elements

The order of items in the arrays does not matter for this comparison.

Args:
col_name (str | ColumnExpression): Input column name or ColumnExpression
empty_is_subset (bool): If True, an empty array is considered a subset of
any array (including another empty array). Default is False.
"""
self.col_expression = ColumnExpression.instantiate_if_str(col_name)
self.empty_is_subset = empty_is_subset

# Postgres not supported since it doesn't correctly deal with zero length arrays
@unsupported_splink_dialects(["sqlite", "postgres"])
def create_sql(self, sql_dialect: SplinkDialect) -> str:
sqlglot_dialect_name = sql_dialect.sqlglot_dialect

empty_check = ""
if not self.empty_is_subset:
empty_check = (
"LEAST(ARRAY_SIZE(___col____l), ARRAY_SIZE(___col____r)) <> 0 AND"
)

sqlglot_base_dialect_sql = f"""
{empty_check}
ARRAY_SIZE(ARRAY_INTERSECT(___col____l, ___col____r)) =
LEAST(ARRAY_SIZE(___col____l), ARRAY_SIZE(___col____r))
"""
translated = _translate_sql_string(
sqlglot_base_dialect_sql, sqlglot_dialect_name
)

self.col_expression.sql_dialect = sql_dialect
col = self.col_expression
translated = translated.replace("___col____l", col.name_l)
translated = translated.replace("___col____r", col.name_r)
return translated

def create_label_for_charts(self) -> str:
return "Array subset"


class PercentageDifferenceLevel(ComparisonLevelCreator):
def __init__(self, col_name: str, percentage_threshold: float):
"""
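To make the semantics of `create_sql` above concrete, here is a plain-Python predicate that mirrors the generated SQL: the intersection size must equal the size of the smaller array, with an optional guard so empty arrays are not subsets. This is an illustrative sketch for arrays of distinct elements, not code from the PR.

```python
def is_array_subset(arr_l: list, arr_r: list, empty_is_subset: bool = False) -> bool:
    """Mirror of ARRAY_SIZE(ARRAY_INTERSECT(l, r)) = LEAST(ARRAY_SIZE(l), ARRAY_SIZE(r))."""
    smaller_size = min(len(arr_l), len(arr_r))
    if not empty_is_subset and smaller_size == 0:
        # Matches the LEAST(...) <> 0 guard added when empty_is_subset is False.
        return False
    return len(set(arr_l) & set(arr_r)) == smaller_size
```

For example, `["A", "B"]` against `["A", "B", "C", "D"]` evaluates to True, while two empty arrays evaluate to False unless `empty_is_subset=True`, matching the test cases in `tests/test_array_columns.py` below.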
86 changes: 85 additions & 1 deletion tests/test_array_columns.py
@@ -1,8 +1,9 @@
import pytest

import splink.internals.comparison_level_library as cll
from splink.comparison_library import ArrayIntersectAtSizes
from tests.decorator import mark_with_dialects_excluding
from tests.literal_utils import run_comparison_vector_value_tests
from tests.literal_utils import run_comparison_vector_value_tests, run_is_in_level_tests


@mark_with_dialects_excluding("sqlite", "spark")
@@ -78,3 +79,86 @@ def test_array_comparison_1(test_helpers, dialect):
ArrayIntersectAtSizes("postcode", [-1, 2]).get_comparison(
db_api.sql_dialect.sqlglot_name
)


@mark_with_dialects_excluding("sqlite", "postgres")
def test_array_subset(test_helpers, dialect):
helper = test_helpers[dialect]
db_api = helper.extra_linker_args()["db_api"]

test_cases = [
{
"description": "ArraySubsetLevel with empty_is_subset=False (default)",
"level": cll.ArraySubsetLevel("arr"),
"inputs": [
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["A", "B", "C", "D"],
"expected": True,
},
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["A", "B", "C", "Z"],
"expected": False,
},
{
"arr_l": ["A", "B"],
"arr_r": ["A", "B", "C", "D"],
"expected": True,
},
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["X", "Y", "Z"],
"expected": False,
},
{
"arr_l": [],
"arr_r": ["X", "Y", "Z"],
"expected": False,
},
{
"arr_l": [],
"arr_r": [],
"expected": False,
},
],
},
{
"description": "ArraySubsetLevel with empty_is_subset=True",
"level": cll.ArraySubsetLevel("arr", empty_is_subset=True),
"inputs": [
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["A", "B", "C", "D"],
"expected": True,
},
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["A", "B", "C", "Z"],
"expected": False,
},
{
"arr_l": ["A", "B"],
"arr_r": ["A", "B", "C", "D"],
"expected": True,
},
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["X", "Y", "Z"],
"expected": False,
},
{
"arr_l": [],
"arr_r": ["X", "Y", "Z"],
"expected": True,
},
{
"arr_l": [],
"arr_r": [],
"expected": True,
},
],
},
]

run_is_in_level_tests(test_cases, db_api)
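For wider context, a sketch of how the new level might sit inside a full comparison. This assumes the splink v4 `CustomComparison` API together with the existing `NullLevel`, `ArrayIntersectLevel`, and `ElseLevel` levels; the exact parameter names are assumptions, and the column name `postcode_arr` is hypothetical.

```python
# Hypothetical sketch, not taken from the PR.
import splink.comparison_level_library as cll
import splink.comparison_library as cl

# Assumed API: CustomComparison combines individual levels into one comparison.
postcode_comparison = cl.CustomComparison(
    output_column_name="postcode_arr",
    comparison_levels=[
        cll.NullLevel("postcode_arr"),
        # New level from this PR: smaller array fully contained in the larger one.
        cll.ArraySubsetLevel("postcode_arr"),
        # Fall back to partial overlap before the catch-all level.
        cll.ArrayIntersectLevel("postcode_arr", min_intersection=1),
        cll.ElseLevel(),
    ],
)
```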