ArraySubset comparison level #2416

Merged 12 commits on Sep 20, 2024
76 changes: 21 additions & 55 deletions .github/workflows/run_demos_examples.yml
@@ -8,76 +8,42 @@ on:
paths:
- "splink/**"
- "docs/demos/examples/**"
- !docs/demos/examples/examples_index.md
- "!docs/demos/examples/examples_index.md"
- "pyproject.toml"

workflow_dispatch:

jobs:
test-notebooks:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
test-group: ["duckdb", "spark", "sqlite"]
name: Test ${{ matrix.test-group }} notebooks with Python ${{ matrix.python-version }}
steps:
#----------------------------------------------
# check-out repo and set-up python
#----------------------------------------------
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
- uses: actions/checkout@v4

- name: Install poetry using pipx
run: pipx install poetry

- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
#----------------------------------------------
# -- save a few seconds by caching poetry --
#----------------------------------------------
- name: Load cached Poetry installation
uses: actions/cache@v2
with:
path: ~/.local # the path depends on the OS
key: poetry-1 # increment to reset cache
#----------------------------------------------
# ----- install & configure poetry -----
#----------------------------------------------
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: "1.7.0"
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true
#----------------------------------------------
# load cached venv if cache exists
#----------------------------------------------
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v2
with:
path: .venv
key: venv-demos-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-01
#----------------------------------------------
# install dependencies if cache does not exist
#----------------------------------------------
cache: "poetry"

- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction
#----------------------------------------------
# pip install additional requirements
#----------------------------------------------
- name: Install additional requirements from demos_requirements.txt
run: |
poetry run pip install -r docs/demos/data/demos_requirements.txt
#----------------------------------------------
# Modify the notebooks to reduce the amount of data processed and speed up the tests
#----------------------------------------------
poetry config virtualenvs.in-project true
poetry install --no-interaction --no-root
poetry run pip install -e .

- name: Install additional requirements from demos_requirements.txt
run: poetry run pip install -r docs/demos/data/demos_requirements.txt

- name: Modify Notebooks to reduce data size
run: |
python scripts/reduce_notebook_runtime.py
#----------------------------------------------
# Test each notebook group in parallel
#----------------------------------------------
run: python scripts/reduce_notebook_runtime.py

- name: Test ${{ matrix.test-group }} example notebooks with pytest
run: |
source .venv/bin/activate
python -m pytest -vv --nbmake -n=auto --nbmake-kernel=python3 --durations=0 docs/demos/examples/${{ matrix.test-group }}/*ipynb
poetry run pytest -vv --nbmake -n=auto --nbmake-kernel=python3 --durations=0 docs/demos/examples/${{ matrix.test-group }}/*ipynb
76 changes: 21 additions & 55 deletions .github/workflows/run_demos_tutorials.yml
@@ -13,70 +13,36 @@ on:
workflow_dispatch:

jobs:
build:
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
name: Run tutorial notebooks with Python ${{ matrix.python-version }}
steps:
#----------------------------------------------
# check-out repo and set-up python
#----------------------------------------------
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
- uses: actions/checkout@v4

- name: Install poetry using pipx
run: pipx install poetry

- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
#----------------------------------------------
# -- save a few seconds by caching poetry --
#----------------------------------------------
- name: Load cached Poetry installation
uses: actions/cache@v2
with:
path: ~/.local # the path depends on the OS
key: poetry-1 # increment to reset cache
#----------------------------------------------
# ----- install & configure poetry -----
#----------------------------------------------
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: "1.7.0"
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true
cache: "poetry"

#----------------------------------------------
# load cached venv if cache exists
#----------------------------------------------
- name: Load cached venv
id: cached-poetry-dependencies
uses: actions/cache@v2
with:
path: .venv
key: venv-demos-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-01
#----------------------------------------------
# install dependencies if cache does not exist
#----------------------------------------------
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction
#----------------------------------------------
# pip install additional requirements
#----------------------------------------------
- name: Install additional requirements from demos_requirements.txt
run: |
poetry run pip install -r docs/demos/data/demos_requirements.txt
#----------------------------------------------
# Modify the notebooks to reduce the amount of data processed and speed up the tests
#----------------------------------------------
poetry config virtualenvs.in-project true
poetry install --no-interaction --no-root
poetry run pip install -e .

- name: Install additional requirements
run: poetry run pip install -r docs/demos/data/demos_requirements.txt

- name: Modify Notebooks to reduce data size
run: python scripts/reduce_notebook_runtime.py

- name: Run tutorial notebooks
run: |
python scripts/reduce_notebook_runtime.py
#----------------------------------------------
# Make sure that the demo notebooks run without error
#----------------------------------------------
- name: Test with pytest
run: |
source .venv/bin/activate
python -m pytest -vv --nbmake -n=auto --nbmake-kernel=python3 docs/demos/tutorials/*ipynb
poetry run pytest -vv --nbmake -n=auto --nbmake-kernel=python3 docs/demos/tutorials/*ipynb
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Match weight and m and u probabilities charts now have improved tooltips ([#2392](https://github.com/moj-analytical-services/splink/pull/2392))
- Added new `AbsoluteDifferenceLevel` comparison level for numerical columns ([#2398](https://github.com/moj-analytical-services/splink/pull/2398))
- Added new `CosineSimilarityLevel` and `CosineSimilarityAtThresholds` for comparing array columns using cosine similarity ([#2405](https://github.com/moj-analytical-services/splink/pull/2405))
- Added new `ArraySubsetLevel` for comparing array columns ([#2416](https://github.com/moj-analytical-services/splink/pull/2416))

### Fixed

2 changes: 2 additions & 0 deletions splink/comparison_level_library.py
@@ -4,6 +4,7 @@
AbsoluteTimeDifferenceLevel,
And,
ArrayIntersectLevel,
ArraySubsetLevel,
ColumnsReversedLevel,
CosineSimilarityLevel,
CustomLevel,
@@ -40,6 +41,7 @@
"AbsoluteDateDifferenceLevel",
"DistanceInKMLevel",
"ArrayIntersectLevel",
"ArraySubsetLevel",
"PercentageDifferenceLevel",
"AbsoluteDifferenceLevel",
"And",
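A minimal usage sketch (not part of the PR diff): constructing the newly re-exported level from the public `splink.comparison_level_library` module, using the constructor signature added in `splink/internals/comparison_level_library.py` below. The column name `postcode_arr` is hypothetical.

```python
# Illustrative sketch only: assumes an array-valued column named "postcode_arr".
import splink.comparison_level_library as cll

# Default behaviour: empty arrays are never treated as subsets.
strict_subset = cll.ArraySubsetLevel("postcode_arr")

# Lenient variant: an empty array counts as a subset of any array.
lenient_subset = cll.ArraySubsetLevel("postcode_arr", empty_is_subset=True)
```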
46 changes: 46 additions & 0 deletions splink/internals/comparison_level_library.py
@@ -873,6 +873,52 @@ def create_label_for_charts(self) -> str:
return f"Array intersection size >= {self.min_intersection}"


class ArraySubsetLevel(ComparisonLevelCreator):
def __init__(self, col_name: str | ColumnExpression, empty_is_subset: bool = False):
"""Represents a comparison level where the smaller array is an
exact subset of the larger array. If arrays are equal length, they
must have the same elements

The order of items in the arrays does not matter for this comparison.

Args:
col_name (str | ColumnExpression): Input column name or ColumnExpression
empty_is_subset (bool): If True, an empty array is considered a subset of
any array (including another empty array). Default is False.
"""
self.col_expression = ColumnExpression.instantiate_if_str(col_name)
self.empty_is_subset = empty_is_subset

# Postgres not supported since it doesn't correctly deal with zero length arrays
@unsupported_splink_dialects(["sqlite", "postgres"])
def create_sql(self, sql_dialect: SplinkDialect) -> str:
sqlglot_dialect_name = sql_dialect.sqlglot_dialect

empty_check = ""
if not self.empty_is_subset:
empty_check = (
"LEAST(ARRAY_SIZE(___col____l), ARRAY_SIZE(___col____r)) <> 0 AND"
)

sqlglot_base_dialect_sql = f"""
{empty_check}
ARRAY_SIZE(ARRAY_INTERSECT(___col____l, ___col____r)) =
LEAST(ARRAY_SIZE(___col____l), ARRAY_SIZE(___col____r))
"""
translated = _translate_sql_string(
sqlglot_base_dialect_sql, sqlglot_dialect_name
)

self.col_expression.sql_dialect = sql_dialect
col = self.col_expression
translated = translated.replace("___col____l", col.name_l)
translated = translated.replace("___col____r", col.name_r)
return translated

def create_label_for_charts(self) -> str:
return "Array subset"


class PercentageDifferenceLevel(ComparisonLevelCreator):
def __init__(self, col_name: str, percentage_threshold: float):
"""
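To make the semantics of `create_sql` above concrete, here is a plain-Python predicate that mirrors the generated SQL: the intersection size must equal the size of the smaller array, with an optional guard so empty arrays are not subsets. This is an illustrative sketch for arrays of distinct elements, not code from the PR.

```python
def is_array_subset(arr_l: list, arr_r: list, empty_is_subset: bool = False) -> bool:
    """Mirror of ARRAY_SIZE(ARRAY_INTERSECT(l, r)) = LEAST(ARRAY_SIZE(l), ARRAY_SIZE(r))."""
    smaller_size = min(len(arr_l), len(arr_r))
    if not empty_is_subset and smaller_size == 0:
        # Matches the LEAST(...) <> 0 guard added when empty_is_subset is False.
        return False
    return len(set(arr_l) & set(arr_r)) == smaller_size
```

For example, `["A", "B"]` against `["A", "B", "C", "D"]` evaluates to True, while two empty arrays evaluate to False unless `empty_is_subset=True`, matching the test cases in `tests/test_array_columns.py` below.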
86 changes: 85 additions & 1 deletion tests/test_array_columns.py
@@ -1,8 +1,9 @@
import pytest

import splink.internals.comparison_level_library as cll
from splink.comparison_library import ArrayIntersectAtSizes
from tests.decorator import mark_with_dialects_excluding
from tests.literal_utils import run_comparison_vector_value_tests
from tests.literal_utils import run_comparison_vector_value_tests, run_is_in_level_tests


@mark_with_dialects_excluding("sqlite", "spark")
@@ -78,3 +79,86 @@ def test_array_comparison_1(test_helpers, dialect):
ArrayIntersectAtSizes("postcode", [-1, 2]).get_comparison(
db_api.sql_dialect.sqlglot_name
)


@mark_with_dialects_excluding("sqlite", "postgres")
def test_array_subset(test_helpers, dialect):
helper = test_helpers[dialect]
db_api = helper.extra_linker_args()["db_api"]

test_cases = [
{
"description": "ArraySubsetLevel with empty_is_subset=False (default)",
"level": cll.ArraySubsetLevel("arr"),
"inputs": [
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["A", "B", "C", "D"],
"expected": True,
},
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["A", "B", "C", "Z"],
"expected": False,
},
{
"arr_l": ["A", "B"],
"arr_r": ["A", "B", "C", "D"],
"expected": True,
},
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["X", "Y", "Z"],
"expected": False,
},
{
"arr_l": [],
"arr_r": ["X", "Y", "Z"],
"expected": False,
},
{
"arr_l": [],
"arr_r": [],
"expected": False,
},
],
},
{
"description": "ArraySubsetLevel with empty_is_subset=True",
"level": cll.ArraySubsetLevel("arr", empty_is_subset=True),
"inputs": [
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["A", "B", "C", "D"],
"expected": True,
},
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["A", "B", "C", "Z"],
"expected": False,
},
{
"arr_l": ["A", "B"],
"arr_r": ["A", "B", "C", "D"],
"expected": True,
},
{
"arr_l": ["A", "B", "C", "D"],
"arr_r": ["X", "Y", "Z"],
"expected": False,
},
{
"arr_l": [],
"arr_r": ["X", "Y", "Z"],
"expected": True,
},
{
"arr_l": [],
"arr_r": [],
"expected": True,
},
],
},
]

run_is_in_level_tests(test_cases, db_api)
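For wider context, a sketch of how the new level might sit inside a full comparison. This assumes the splink v4 `CustomComparison` API together with the existing `NullLevel`, `ArrayIntersectLevel`, and `ElseLevel` levels; the exact parameter names are assumptions, and the column name `postcode_arr` is hypothetical.

```python
# Hypothetical sketch, not taken from the PR.
import splink.comparison_level_library as cll
import splink.comparison_library as cl

# Assumed API: CustomComparison combines individual levels into one comparison.
postcode_comparison = cl.CustomComparison(
    output_column_name="postcode_arr",
    comparison_levels=[
        cll.NullLevel("postcode_arr"),
        # New level from this PR: smaller array fully contained in the larger one.
        cll.ArraySubsetLevel("postcode_arr"),
        # Fall back to partial overlap before the catch-all level.
        cll.ArrayIntersectLevel("postcode_arr", min_intersection=1),
        cll.ElseLevel(),
    ],
)
```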