
Cluster without linker #2412

Merged: 15 commits merged into master on Sep 30, 2024
Conversation

@RobinL (Member) commented Sep 18, 2024

We've heard from several people who want to cluster without a linker, for instance when combining predictions from multiple models and then clustering the combined results (e.g. #2358).

This PR allows the clustering algorithm to be used without needing a linker, similar to the exploratory analysis functions.

Example without linker
from splink import DuckDBAPI
from splink.internals.clustering import cluster_pairwise_predictions_at_threshold

db_api = DuckDBAPI()

nodes = [
    {"my_id": 1},
    {"my_id": 2},
    {"my_id": 3},
    {"my_id": 4},
    {"my_id": 5},
    {"my_id": 6},
]

edges = [
    {"n_1": 1, "n_2": 2, "match_probability": 0.8},
    {"n_1": 3, "n_2": 2, "match_probability": 0.9},
    {"n_1": 4, "n_2": 5, "match_probability": 0.99},
]

cluster_pairwise_predictions_at_threshold(
    nodes,
    edges,
    node_id_column_name="my_id",
    edge_id_column_name_left="n_1",
    edge_id_column_name_right="n_2",
    db_api=db_api,
    threshold_match_probability=0.5,
).as_pandas_dataframe()
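
With a threshold of 0.5 all three edges are retained, so the expected clusters are {1, 2, 3}, {4, 5} and the singleton {6}. A quick cross-check of that expectation with networkx (a sketch for illustration, not part of the PR):

import networkx as nx

# Illustration only: connected components over the thresholded edges
# should match the clusters splink returns.
G = nx.Graph()
G.add_nodes_from(n["my_id"] for n in nodes)
G.add_edges_from(
    (e["n_1"], e["n_2"]) for e in edges if e["match_probability"] >= 0.5
)
print(list(nx.connected_components(G)))  # [{1, 2, 3}, {4, 5}, {6}]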

nodes = [
    {"abc": 1},
    {"abc": 2},
    {"abc": 3},
    {"abc": 4},
]

edges = [
    {"abc_l": 1, "abc_r": 2, "match_probability": 0.8},
    {"abc_l": 3, "abc_r": 2, "match_probability": 0.9},
]

cluster_pairwise_predictions_at_threshold(
    nodes,
    edges,
    node_id_column_name="abc",
    db_api=db_api,
    threshold_match_probability=0.5,
).as_pandas_dataframe()
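Note that the edge id column names are omitted in this second call; they appear to default to <node_id_column_name>_l and <node_id_column_name>_r, i.e. abc_l and abc_r here.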
Example with a linker
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
from splink.internals.clustering import cluster_pairwise_predictions_at_threshold

db_api = DuckDBAPI()

df = splink_datasets.fake_1000

# Split df into two frames using unique_id modulo 2
df_1 = df[df["unique_id"] % 2 == 0]
df_2 = df[df["unique_id"] % 2 == 1]

settings = SettingsCreator(
    link_type="link_and_dedupe",
    comparisons=[
        cl.ExactMatch("first_name"),
        cl.ExactMatch("surname"),
        cl.ExactMatch("dob"),
        cl.ExactMatch("city").configure(term_frequency_adjustments=True),
        cl.ExactMatch("email"),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        block_on("surname"),
        block_on("dob"),
        block_on("city"),
        block_on("email"),
    ],
    max_iterations=2,
)

linker = Linker([df_1, df_2], settings, db_api, input_table_aliases=["a", "b"])
linker._settings_obj._get_source_dataset_column_name_is_required()
pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)
pairwise_predictions.as_pandas_dataframe().sort_values(["unique_id_l", "unique_id_r"])
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions, 0.00001
)


cluster_pairwise_predictions_at_threshold(
    df,
    pairwise_predictions.physical_name,
    node_id_column_name="unique_id",
    db_api=db_api,
    threshold_match_probability=0.00001,
).as_pandas_dataframe()
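Note that the standalone function also accepts the physical table name of an existing predictions table (pairwise_predictions.physical_name above) as the edges argument.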
Also works for deterministic linking
import os

import pandas as pd

from splink import DuckDBAPI, Linker, SettingsCreator
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

# Load the data
df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")

# Define blocking rules
br_for_predict = [
    "l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob",
    "l.surname = r.surname and l.dob = r.dob and l.email = r.email",
    "l.first_name = r.first_name and l.surname = r.surname and l.email = r.email",
]

# Create settings
settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=br_for_predict,
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
)

# Initialize DuckDB API
db_api = DuckDBAPI()


# Create linker
linker = Linker(df, settings, db_api=db_api)

# Perform deterministic linking
df_predict = linker.inference.deterministic_link()

# Cluster predictions
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    df_predict,
)

clusters.as_pandas_dataframe()
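
No threshold is passed here; as one of the review comments below notes, the previous match_probability = 1 hack for deterministic links is no longer required after this refactor.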



Review comment on:

def cluster_pairwise_predictions_at_threshold(
    nodes: AcceptableInputTableType,

RobinL (Member Author):

This should probably eventually allow the input to also be a SplinkDataFrame, but I think that's for a wider PR which allows all public-API functions to accept SplinkDataFrames.

RobinL (Member Author):

The match_probability = 1 hack is no longer required due to this refactor.

@nabebaye commented:

🙌 Thanks for pushing this out! This will be extremely helpful for using Splink where the data is periodically fed live into DuckDB.

RobinL requested a review from ADBond September 27, 2024 13:28
@@ -1,18 +1,15 @@
---
tags:
- API
- Clustering
- clustering
RobinL (Member Author):

The old file is now linker_clustering.md, to distinguish it from the 'plain' (no-linker) clustering method.

@@ -2,54 +2,41 @@
import pytest

from tests.cc_testing_utils import (
RobinL (Member Author):

I've switched all the tests over to use the plain (no-linker) clustering functions.

return pd.DataFrame(rows)


def check_df_equality(df1, df2, skip_dtypes=False):
RobinL (Member Author):

Syntax like assert (cc_df.values == nx_df.values).all() is sufficient, so this doesn't need to be a function.
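
For illustration only, a minimal sketch of the inline comparison that replaces the helper; the column names cluster_id and node_id are hypothetical stand-ins for whatever columns the two result frames share, and both frames must be sorted identically first:

# Hypothetical sketch: sort both result frames the same way, then a plain
# element-wise comparison replaces the check_df_equality helper.
cc_df = cc_df.sort_values(["cluster_id", "node_id"]).reset_index(drop=True)
nx_df = nx_df.sort_values(["cluster_id", "node_id"]).reset_index(drop=True)
assert (cc_df.values == nx_df.values).all()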

RobinL removed the request for review from ADBond September 30, 2024 07:52
@RobinL (Member Author) commented Sep 30, 2024

Another testing script
import duckdb
import networkx as nx
import pandas as pd

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on


def generate_random_graph(graph_size, seed=47):
    if graph_size < 10:
        density = 1 / graph_size
    else:
        density = 2 / graph_size
    # print(f"Graph size: {graph_size}, Density: {density}")

    graph = nx.fast_gnp_random_graph(graph_size, density, seed=seed, directed=False)
    return graph


def nodes_and_edges_from_graph(G):
    edges = nx.to_pandas_edgelist(G)
    edges.columns = ["unique_id_l", "unique_id_r"]

    nodes = pd.DataFrame({"unique_id": list(G.nodes)})

    return nodes, edges


g = generate_random_graph(10000)
nodes, edges = nodes_and_edges_from_graph(g)

G = nx.from_pandas_edgelist(edges, "unique_id_l", "unique_id_r")

# Ensure all nodes from the original graph are in G
for node in nodes["unique_id"]:
    if node not in G:
        G.add_node(node)

connected_components = list(nx.connected_components(G))

# Create a dictionary mapping node to cluster
node_to_cluster = {}
for cluster_id, component in enumerate(connected_components):
    for node in component:
        node_to_cluster[node] = cluster_id

# Create the final DataFrame
nodes_with_clusters = nodes.copy()
nodes_with_clusters["cluster"] = nodes_with_clusters["unique_id"].map(node_to_cluster)


db_api = DuckDBAPI(":default:")

blocking_rules = [
    block_on("cluster"),
]


settings = SettingsCreator(
    link_type="dedupe_only",
    probability_two_random_records_match=0.5,
    blocking_rules_to_generate_predictions=blocking_rules,
    comparisons=[
        cl.ExactMatch("cluster").configure(
            m_probabilities=[0.99, 0.01], u_probabilities=[0.01, 0.99]
        )
    ],
    retain_intermediate_calculation_columns=True,
)


linker = Linker(nodes_with_clusters, settings, db_api=db_api)
linker.visualisations.match_weights_chart()

df_predict = linker.inference.predict()

res = linker.clustering.cluster_pairwise_predictions_at_threshold(
    df_predict=df_predict, threshold_match_probability=0.5
)

res_duck = res.as_duckdbpyrelation()
res_duck
sql = """
SELECT
    COUNT(DISTINCT cluster_id) AS number_of_clusters,
    AVG(cluster_size) AS average_cluster_size
FROM (
    SELECT
        cluster_id,
        COUNT(*) AS cluster_size
    FROM res_duck
    GROUP BY cluster_id
)
"""

duckdb.sql(sql)
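
A quick sanity check (not in the original script) that could be appended, since the script already computed the ground-truth components with networkx:

# Sanity check, not in the original script: splink's cluster count should
# equal the number of connected components networkx found above.
n_components = len(connected_components)
n_clusters = duckdb.sql(
    "SELECT COUNT(DISTINCT cluster_id) FROM res_duck"
).fetchone()[0]
assert n_clusters == n_components, (n_clusters, n_components)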

RobinL merged commit 5a9068b into master Sep 30, 2024
27 checks passed
RobinL deleted the cluster_without_linker branch September 30, 2024 07:58
ADBond added a commit that referenced this pull request Oct 2, 2024
change in #2412, but only just rebased so that it affects this branch