Skip to content

Commit

Permalink
Allow clustering without a match_probability column
Browse files (browse the repository at this point in the history)
  • Loading branch information
RobinL committed Sep 18, 2024
1 parent 2c16b23 commit b2b7cd1
Showing 1 changed file with 20 additions and 4 deletions.
24 changes: 20 additions & 4 deletions splink/internals/linker_components/clustering.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -85,15 +85,31 @@ def cluster_pairwise_predictions_at_threshold(

  nodes_with_composite_ids = db_api.sql_pipeline_to_splink_dataframe(pipeline)

+ has_match_prob_col = "match_probability" in [
+ c.unquote().name for c in df_predict.columns
+ ]
+
+ if not has_match_prob_col and threshold_match_probability is not None:
+ raise ValueError(
+ "df_predict must have a column called 'match_probability' if "
+ "threshold_match_probability is provided"
+ )
+
+ match_p_expr = ""
+ match_p_select_expr = ""
+ if threshold_match_probability is not None:
+ match_p_expr = f"where match_probability >= {threshold_match_probability}"
+ match_p_select_expr = ", match_probability"
+
  pipeline = CTEPipeline([df_predict])

  sql = f"""
  select
  {uid_concat_edges_l} as node_id_l,
- {uid_concat_edges_r} as node_id_r,
- match_probability
- from __splink__df_predict
- where match_probability >= {threshold_match_probability}
+ {uid_concat_edges_r} as node_id_r
+ {match_p_select_expr}
+ from {df_predict.templated_name}
+ {match_p_expr}
  """
  pipeline.enqueue_sql(sql, "__splink__df_edges")

Expand Down

0 comments on commit b2b7cd1

Please sign in to comment.