This section contains reference material for the modules and functions within Splink.
+Information on pre-made data tables available within Splink suitable for linking, to get up-and-running or to try out ideas.
+Reference materials for the Splink Settings dictionary:
+block_on
¶Generates blocking rules of equality conditions based on the columns +or SQL expressions specified.
+When multiple columns or SQL snippets are provided, the function generates a +compound blocking rule, connecting individual match conditions with +"AND" clauses.
+Further information on equi-join conditions can be found +here
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_names_or_exprs |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ A list of input columns or SQL conditions +you wish to create blocks on. + |
+
+ ()
+ |
+
salting_partitions |
+
+ (optional, int)
+ |
+
+
+
+ The number of salting partitions to add to the blocking rule. More information on salting can be found within the docs. |
+
+ None
+ |
+
arrays_to_explode |
+
+ (optional, List[str])
+ |
+
+
+
+ List of arrays to explode +before applying the blocking rule. + |
+
+ None
+ |
+
Examples:
+from splink import block_on
+br_1 = block_on("first_name")
+br_2 = block_on("substr(surname,1,2)", "surname")
+
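Salting and array explosion can be specified via the optional arguments. A sketch, assuming a hypothetical array-valued column postcode_history:

br_3 = block_on("dob", salting_partitions=10)
br_4 = block_on("postcode_history", arrays_to_explode=["postcode_history"])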
splink.blocking_analysis
¶count_comparisons_from_blocking_rule(*, table_or_tables, blocking_rule, link_type, db_api, unique_id_column_name='unique_id', source_dataset_column_name=None, compute_post_filter_count=True, max_rows_limit=int(1000000000.0))
+
+¶Analyse a blocking rule to understand the number of comparisons it will generate.
+Read more about the definition of pre and post filter conditions +here
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
table_or_tables |
+
+ (dataframe, str)
+ |
+
+
+
+ Input data + |
+ + required + | +
blocking_rule |
+
+ Union[BlockingRuleCreator, str, Dict[str, Any]]
+ |
+
+
+
+ The blocking +rule to analyse + |
+ + required + | +
link_type |
+
+ user_input_link_type_options
+ |
+
+
+
+ The link type - "link_only", +"dedupe_only" or "link_and_dedupe" + |
+ + required + | +
db_api |
+
+ DatabaseAPISubClass
+ |
+
+
+
+ Database API + |
+ + required + | +
unique_id_column_name |
+
+ str
+ |
+
+
+
+ Defaults to "unique_id". + |
+
+ 'unique_id'
+ |
+
source_dataset_column_name |
+
+ Optional[str]
+ |
+
+
+
+ Defaults to None. + |
+
+ None
+ |
+
compute_post_filter_count |
+
+ bool
+ |
+
+
+
+ Whether to use a slower methodology +to calculate how many comparisons will be generated post filter conditions. +Defaults to True. + |
+
+ True
+ |
+
max_rows_limit |
+
+ int
+ |
+
+
+
+ Calculation of post filter counts will only +proceed if the fast method returns a value below this limit. Defaults +to int(1e9). + |
+
+ int(1000000000.0)
+ |
+
Returns:
+Type | +Description | +
---|---|
+ dict[str, Union[int, str]]
+ |
+
+
+
+ dict[str, Union[int, str]]: A dictionary containing the results + |
+
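For example, a minimal usage sketch with the DuckDB backend, where df is assumed to be a pandas DataFrame of input records:

from splink import DuckDBAPI, block_on
from splink.blocking_analysis import count_comparisons_from_blocking_rule

counts = count_comparisons_from_blocking_rule(
    table_or_tables=df,
    blocking_rule=block_on("first_name", "surname"),
    link_type="dedupe_only",
    db_api=DuckDBAPI(),
)
print(counts)  # dictionary of pre- and post-filter comparison counts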
cumulative_comparisons_to_be_scored_from_blocking_rules_chart(*, table_or_tables, blocking_rules, link_type, db_api, unique_id_column_name='unique_id', max_rows_limit=int(1000000000.0), source_dataset_column_name=None)
+
+¶cumulative_comparisons_to_be_scored_from_blocking_rules_data(*, table_or_tables, blocking_rules, link_type, db_api, unique_id_column_name='unique_id', max_rows_limit=int(1000000000.0), source_dataset_column_name=None)
+
+¶n_largest_blocks(*, table_or_tables, blocking_rule, link_type, db_api, n_largest=5)
+
+¶Find the values responsible for creating the largest blocks of records.
For example, when blocking on first name and surname, the 'John Smith' block might be the largest block of records. In cases where values are highly skewed, a few values may be responsible for generating a large proportion of all comparisons. This function helps you find the culprit values.
+The analysis is performed pre filter conditions, read more about what this means +here
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
table_or_tables |
+
+ (dataframe, str)
+ |
+
+
+
+ Input data + |
+ + required + | +
blocking_rule |
+
+ Union[BlockingRuleCreator, str, Dict[str, Any]]
+ |
+
+
+
+ The blocking +rule to analyse + |
+ + required + | +
link_type |
+
+ user_input_link_type_options
+ |
+
+
+
+ The link type - "link_only", +"dedupe_only" or "link_and_dedupe" + |
+ + required + | +
db_api |
+
+ DatabaseAPISubClass
+ |
+
+
+
+ Database API + |
+ + required + | +
n_largest |
+
+ int
+ |
+
+
+
+ How many rows to return. Defaults to 5. + |
+
+ 5
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ 'SplinkDataFrame'
+ |
+
+
+
+ A dataframe containing the n_largest blocks + |
+
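For example, a sketch where df is assumed to be your input records:

from splink import DuckDBAPI, block_on
from splink.blocking_analysis import n_largest_blocks

result = n_largest_blocks(
    table_or_tables=df,
    blocking_rule=block_on("city", "first_name"),
    link_type="dedupe_only",
    db_api=DuckDBAPI(),
    n_largest=3,
)
result.as_pandas_dataframe()  # the values generating the three largest blocks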
Cluster the results of the linkage model and analyse clusters, accessed via
+linker.clustering
.
cluster_pairwise_predictions_at_threshold(df_predict, threshold_match_probability=None)
+
+¶Clusters the pairwise match predictions that result from
+linker.inference.predict()
into groups of connected records using the connected
+components graph clustering algorithm
Records with an estimated match_probability
at or above
+threshold_match_probability
are considered to be a match (i.e. they represent
+the same entity).
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df_predict |
+
+ SplinkDataFrame
+ |
+
+
+
+ The results of linker.inference.predict() |
+ + required + | +
threshold_match_probability |
+
+ float
+ |
+
+
+
+ Pairwise comparisons with a match_probability at or above this threshold are clustered together
+ |
+
+ None
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ A SplinkDataFrame containing a list of all IDs, clustered +into groups based on the desired match threshold. + |
+
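For example, a sketch assuming a trained linker:

df_predict = linker.inference.predict()
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    df_predict, threshold_match_probability=0.95
)
clusters.as_pandas_dataframe()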
compute_graph_metrics(df_predict, df_clustered, *, threshold_match_probability=None)
+
+¶Generates tables containing graph metrics (for nodes, edges and clusters), +and returns a data class of Splink dataframes
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df_predict |
+
+ SplinkDataFrame
+ |
+
+
+
+ The results of linker.inference.predict() |
+ + required + | +
df_clustered |
+
+ SplinkDataFrame
+ |
+
+
+
+ The outputs of linker.clustering.cluster_pairwise_predictions_at_threshold()
+ |
+ + required + | +
threshold_match_probability |
+
+ float
+ |
+
+
+
+ Filter the pairwise match
+predictions to include only pairwise comparisons with a
+match_probability at or above this threshold. If not provided, the value
+will be taken from metadata on df_clustered. |
+
+ None
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
GraphMetricsResult |
+ GraphMetricsResults
+ |
+
+
+
+ A data class containing SplinkDataFrames + |
+
+ GraphMetricsResults
+ |
+
+
+
+ of cluster IDs and selected node, edge or cluster metrics. +attribute "nodes" for nodes metrics table +attribute "edges" for edge metrics table +attribute "clusters" for cluster metrics table + |
+
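For example, a sketch where df_predict and df_clustered are the outputs of predict() and the clustering step above:

graph_metrics = linker.clustering.compute_graph_metrics(df_predict, df_clustered)
graph_metrics.clusters.as_pandas_dataframe()  # .nodes and .edges are also available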
In comparisons, you may wish to consider expressions which are not simply columns of your input table.
+For instance you may have a forename
column in your data, but when comparing records you may also wish to use the values in this column transformed to lowercase, or just the first three letters of the name, or perhaps both of these transformations taken together.
If it is feasible to do so, then it may be best to derive a new column containing the transformed data. +Particularly if it is an expensive calculation, or you wish to refer to it many times, deriving the column once on your input data may well be preferable, as it is cheaper than doing so directly in comparisons where each input record may need to be processed many times. +However, there may be situations where you don't wish to derive a new column, perhaps for large data where you have many such transformations, or when you are experimenting with different models.
+This is where a ColumnExpression
may be used. It represents some SQL expression, which may be a column, or some more complicated construct,
+to which you can also apply zero or more transformations. These are lazily evaluated, and in particular will not be tied to a specific SQL dialect until they are put (via settings) into a linker.
One caveat to using a ColumnExpression
is that it cannot be combined with term frequency adjustments.
+Term frequency adjustments can only be computed on the raw values in a column prior to any function transforms.
If you wish to use term frequencies with transformations of an input column, you must pre-compute a new column in your input data
+with the transforms applied, instead of a ColumnExpression
.
from splink import ColumnExpression
+
+email_lowercase = ColumnExpression("email").lower()
+dob_as_string = ColumnExpression("dob").cast_to_string()
+surname_initial_lowercase = ColumnExpression("surname").substr(1, 1).lower()
+entry_date = ColumnExpression("entry_date_str").try_parse_date(date_format="YYYY-MM-DD")
+full_name_lowercase = ColumnExpression("first_name || ' ' || surname").lower()
+
You can use a ColumnExpression
in most places where you might also use a simple column name, such as in a library comparison, a library comparison level, or in a blocking rule:
from splink import block_on
+import splink.comparison_library as cl
+import splink.comparison_level_library as cll
+
+full_name_lower_br = block_on(full_name_lowercase)
+
+email_comparison = cl.DamerauLevenshteinAtThresholds(email_lowercase, distance_threshold_or_thresholds=[1, 3])
+entry_date_comparison = cl.AbsoluteTimeDifferenceAtThresholds(
+ entry_date,
+ input_is_string=False,
+ metrics=["day", "day"],
+ thresholds=[1, 10],
+)
+name_comparison = cl.CustomComparison(
+ comparison_levels=[
+ cll.NullLevel(full_name_lowercase),
+ cll.ExactMatch(full_name_lowercase),
+ cll.ExactMatch("surname")
+ cll.ExactMatch("first_name"),
+ cll.ExactMatch(surname_initial_lowercase),
+ cll.ElseLevel()
+ ],
+ output_column_name="name",
+)
+
ColumnExpression
¶Enables transforms to be applied to a column before it's passed into a +comparison level.
+Dialect agnostic. Execution is delayed until the dialect is known.
+ + +from splink.column_expression import ColumnExpression
+col = (
+ ColumnExpression("first_name")
+ .lower()
+ .regex_extract("^[A-Z]{1,4}")
+)
+
+ExactMatchLevel(col)
+
Note that this will typically be created without a dialect, and the dialect
+will later be populated when the ColumnExpression
is passed via a comparison
+level creator into a Linker
.
lower()
+
+¶Applies a lowercase transform to the input expression.
+ +substr(start, length)
+
+¶Applies a substring transform to the input expression of a given length +starting from a specified index.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
start |
+
+ int
+ |
+
+
+
+ The starting index of the substring. + |
+ + required + | +
length |
+
+ int
+ |
+
+
+
+ The length of the substring. + |
+ + required + | +
cast_to_string()
+
+¶Applies a cast to string transform to the input expression.
+ +regex_extract(pattern, capture_group=0)
+
+¶Applies a regex extract transform to the input expression.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
pattern |
+
+ str
+ |
+
+
+
+ The regex pattern to match. + |
+ + required + | +
capture_group |
+
+ int
+ |
+
+
+
+ The capture group to extract from the matched pattern. +Defaults to 0, meaning the full pattern is extracted + |
+
+ 0
+ |
+
try_parse_date(date_format=None)
+
+¶Applies a 'try parse date' transform to the input expression.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
date_format |
+
+ str
+ |
+
+
+
+ The date format to attempt to parse. +Defaults to None, meaning the dialect-specific default format is used. + |
+
+ None
+ |
+
try_parse_timestamp(timestamp_format=None)
+
+¶Applies a 'try parse timestamp' transform to the input expression.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
timestamp_format |
+
+ str
+ |
+
+
+
+ The timestamp format to attempt to parse. +Defaults to None, meaning the dialect-specific default format is used. + |
+
+ None
+ |
+
comparison_level_library
¶AbsoluteDifferenceLevel(col_name, difference_threshold)
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level where the absolute difference between two +numerical values is within a specified threshold.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str | ColumnExpression
+ |
+
+
+
+ Input column name or ColumnExpression. + |
+ + required + | +
difference_threshold |
+
+ int | float
+ |
+
+
+
+ The maximum allowed absolute difference +between the two values. + |
+ + required + | +
AbsoluteTimeDifferenceLevel(col_name, *, input_is_string, threshold, metric, datetime_format=None)
+
+¶
+ Bases: ComparisonLevelCreator
Computes the absolute elapsed time between two dates (total duration).
+This function computes the amount of time that has passed between two dates,
+in contrast to functions like date_diff
found in some SQL backends,
+which count the number of full calendar intervals (e.g., months, years) crossed.
For instance, the difference between January 29th and March 2nd would be less
+than two months in terms of elapsed time, unlike a date_diff
calculation that
+would give an answer of 2 calendar intervals crossed.
Note that the threshold is inclusive, e.g. a level with a 10 day threshold will include differences in date of 10 days.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the input column containing the dates to compare + |
+ + required + | +
input_is_string |
+
+ bool
+ |
+
+
+
+ Indicates if the input date/times are in
+string format, requiring parsing according to datetime_format. |
+ + required + | +
threshold |
+
+ int
+ |
+
+
+
+ The maximum allowed difference between the two dates,
+in units specified by metric. |
+ + required + | +
metric |
+
+ str
+ |
+
+
+
+ The unit of time to use when comparing the dates. +Can be 'second', 'minute', 'hour', 'day', 'month', or 'year'. + |
+ + required + | +
datetime_format |
+
+ str
+ |
+
+
+
+ The format string for parsing dates. +ISO 8601 format used if not provided. + |
+
+ None
+ |
+
And(*comparison_levels)
+
+¶
+ Bases: _Merge
Represents a comparison level that is an 'AND' of other comparison levels
+Merge multiple ComparisonLevelCreators into a single ComparisonLevelCreator by +merging their SQL conditions using a logical "AND".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*comparison_levels |
+
+ ComparisonLevelCreator | dict
+ |
+
+
+
+ These represent the +comparison levels you wish to combine via 'AND' + |
+
+ ()
+ |
+
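For example, a sketch combining two levels from this library:

import splink.comparison_level_library as cll

exact_and_fuzzy = cll.And(
    cll.ExactMatchLevel("surname"),
    cll.JaroWinklerLevel("first_name", 0.9),
)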
ArrayIntersectLevel(col_name, min_intersection)
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level based around the size of an intersection of +arrays
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ Input column name + |
+ + required + | +
min_intersection |
+
+ int
+ |
+
+
+
+ The minimum cardinality of the +intersection of arrays for this comparison level. Defaults to 1 + |
+ + required + | +
ColumnsReversedLevel(col_name_1, col_name_2, symmetrical=False)
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level where the columns are reversed. For example, +if surname is in the forename field and vice versa
+By default, col_l = col_r. If the symmetrical argument is True, then +col_l = col_r AND col_r = col_l.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name_1 |
+
+ str
+ |
+
+
+
+ First column, e.g. forename + |
+ + required + | +
col_name_2 |
+
+ str
+ |
+
+
+
+ Second column, e.g. surname + |
+ + required + | +
symmetrical |
+
+ bool
+ |
+
+
+
+ If True, equality is required in both directions. Default is False. |
+
+ False
+ |
+
CustomLevel(sql_condition, label_for_charts=None, base_dialect_str=None)
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level with a custom sql expression
+Must be in a form suitable for use in a SQL CASE WHEN expression +e.g. "substr(name_l, 1, 1) = substr(name_r, 1, 1)"
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sql_condition |
+
+ str
+ |
+
+
+
+ SQL condition to assess similarity + |
+ + required + | +
label_for_charts |
+
+ str
+ |
+
+
+
+ A label for this level to be used in
+charts. Default None, so that |
+
+ None
+ |
+
base_dialect_str |
+
+ str
+ |
+
+
+
+ If specified, the SQL dialect that +this expression will parsed as when attempting to translate to +other backends + |
+
+ None
+ |
+
DamerauLevenshteinLevel(col_name, distance_threshold)
+
+¶
+ Bases: ComparisonLevelCreator
A comparison level using a Damerau-Levenshtein distance function
+e.g. damerau_levenshtein(val_l, val_r) <= distance_threshold
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ Input column name + |
+ + required + | +
distance_threshold |
+
+ int
+ |
+
+
+
+ The threshold to use to assess +similarity + |
+ + required + | +
DistanceFunctionLevel(col_name, distance_function_name, distance_threshold, higher_is_more_similar=True)
+
+¶
+ Bases: ComparisonLevelCreator
A comparison level using an arbitrary distance function
+e.g. custom_distance(val_l, val_r) >= (<=) distance_threshold
The function given by distance_function_name
must exist in the SQL
+backend you use, and must take two parameters of the type in col_name,
+returning a numeric type
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str | ColumnExpression
+ |
+
+
+
+ Input column name + |
+ + required + | +
distance_function_name |
+
+ str
+ |
+
+
+
+ the name of the SQL distance function + |
+ + required + | +
distance_threshold |
+
+ Union[int, float]
+ |
+
+
+
+ The threshold to use to assess +similarity + |
+ + required + | +
higher_is_more_similar |
+
+ bool
+ |
+
+
+
+ Are higher values of the distance function +more similar? (e.g. True for Jaro-Winkler, False for Levenshtein) +Default is True + |
+
+ True
+ |
+
DistanceInKMLevel(lat_col, long_col, km_threshold, not_null=False)
+
+¶
+ Bases: ComparisonLevelCreator
Use the haversine formula to transform comparisons of lat,lngs +into distances measured in kilometers
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
lat_col |
+
+ str
+ |
+
+
+
+ The name of a latitude column, or the respective array or struct column containing the information. For example: long_lat['lat'] or long_lat[0] |
+ + required + | +
long_col |
+
+ str
+ |
+
+
+
+ The name of a longitude column, or the respective array or struct column containing the information, plus an index. For example: long_lat['long'] or long_lat[1] |
+ + required + | +
km_threshold |
+
+ int
+ |
+
+
+
+ The total distance in kilometers to evaluate your +comparisons against + |
+ + required + | +
not_null |
+
+ bool
+ |
+
+
+
+ If true, ensure no attempt is made to compute this if +any inputs are null. This is only necessary if you are not +capturing nulls elsewhere in your comparison level. + |
+
+ False
+ |
+
ElseLevel
+
+
+¶
+ Bases: ComparisonLevelCreator
This level is used to capture all comparisons that do not match any other +specified levels. It corresponds to the ELSE clause in a SQL CASE statement.
+ + + + +ExactMatchLevel(col_name, term_frequency_adjustments=False)
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level where there is an exact match
+e.g. val_l = val_r
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ Input column name + |
+ + required + | +
term_frequency_adjustments |
+
+ bool
+ |
+
+
+
+ If True, apply term frequency +adjustments to the exact match level. Defaults to False. + |
+
+ False
+ |
+
JaccardLevel(col_name, distance_threshold)
+
+¶
+ Bases: ComparisonLevelCreator
A comparison level using a Jaccard distance function
+e.g. jaccard(val_l, val_r) >= distance_threshold
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ Input column name + |
+ + required + | +
distance_threshold |
+
+ Union[int, float]
+ |
+
+
+
+ The threshold to use to assess +similarity + |
+ + required + | +
JaroLevel(col_name, distance_threshold)
+
+¶
+ Bases: ComparisonLevelCreator
A comparison level using a Jaro distance function
+e.g. jaro(val_l, val_r) >= distance_threshold
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ Input column name + |
+ + required + | +
distance_threshold |
+
+ Union[int, float]
+ |
+
+
+
+ The threshold to use to assess +similarity + |
+ + required + | +
JaroWinklerLevel(col_name, distance_threshold)
+
+¶
+ Bases: ComparisonLevelCreator
A comparison level using a Jaro-Winkler distance function
+e.g. jaro_winkler(val_l, val_r) >= distance_threshold
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ Input column name + |
+ + required + | +
distance_threshold |
+
+ Union[int, float]
+ |
+
+
+
+ The threshold to use to assess +similarity + |
+ + required + | +
LevenshteinLevel(col_name, distance_threshold)
+
+¶
+ Bases: ComparisonLevelCreator
A comparison level using a Levenshtein distance function
+e.g. levenshtein(val_l, val_r) <= distance_threshold
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ Input column name + |
+ + required + | +
distance_threshold |
+
+ int
+ |
+
+
+
+ The threshold to use to assess +similarity + |
+ + required + | +
LiteralMatchLevel(col_name, literal_value, literal_datatype, side_of_comparison='both')
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level where a column matches a literal value
+e.g. val_l = 'literal' AND/OR val_r = 'literal'
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ Input column name or +ColumnExpression + |
+ + required + | +
literal_value |
+
+ str
+ |
+
+
+
+ The literal value to compare against e.g. 'male' + |
+ + required + | +
literal_datatype |
+
+ str
+ |
+
+
+
+ The datatype of the literal value. +Must be one of: "string", "int", "float", "date" + |
+ + required + | +
side_of_comparison |
+
+ str
+ |
+
+
+
+ Which side(s) of the comparison to +apply. Must be one of: "left", "right", "both". Defaults to "both". + |
+
+ 'both'
+ |
+
Not(comparison_level)
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level that is the negation of another comparison level
+The resulting ComparisonLevelCreator is equivalent to the passed ComparisonLevelCreator, but with its SQL conditions negated with logical "NOT".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
comparison_level |
+
+ ComparisonLevelCreator | dict
+ |
+
+
+
+ This represents the +comparison level you wish to negate with 'NOT' + |
+ + required + | +
NullLevel(col_name, valid_string_pattern=None)
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level where either or both values are NULL
+e.g. val_l IS NULL OR val_r IS NULL
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ Input column name or ColumnExpression + |
+ + required + | +
valid_string_pattern |
+
+ str
+ |
+
+
+
+ If provided, a regex pattern to extract +a valid substring from the column before checking for NULL. Default is None. + |
+
+ None
+ |
+
If a valid_string_pattern is provided, the NULL check will be performed on +the extracted substring rather than the original column value.
+Or(*comparison_levels)
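For example, a sketch using a hypothetical postcode column, where values not matching the regex are treated as NULL:

import splink.comparison_level_library as cll

null_level = cll.NullLevel(
    "postcode",
    valid_string_pattern="^[A-Z]{1,2}[0-9][A-Z0-9]? ?[0-9][A-Z]{2}$",
)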
+
+¶
+ Bases: _Merge
Represents a comparison level that is an 'OR' of other comparison levels
+Merge multiple ComparisonLevelCreators into a single ComparisonLevelCreator by +merging their SQL conditions using a logical "OR".
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
*comparison_levels |
+
+ ComparisonLevelCreator | dict
+ |
+
+
+
+ These represent the +comparison levels you wish to combine via 'OR' + |
+
+ ()
+ |
+
PercentageDifferenceLevel(col_name, percentage_threshold)
+
+¶
+ Bases: ComparisonLevelCreator
Represents a comparison level where the difference between two numerical +values is within a specified percentage threshold.
+The percentage difference is calculated as the absolute difference between the +two values divided by the greater of the two values.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ Input column name. + |
+ + required + | +
percentage_threshold |
+
+ float
+ |
+
+
+
+ The threshold percentage to use +to assess similarity e.g. 0.1 for 10%. + |
+ + required + | +
An alias of AbsoluteTimeDifferenceLevel.
+Note that all comparison levels have a .configure()
method as follows:
Configure the comparison level with options which are common to all +comparison levels. The options align to the keys in the json +specification of a comparison level. These options are usually not +needed, but are available for advanced users.
+All options have default options set initially. Any call to .configure()
+will set any options that are supplied. Any subsequent calls to .configure()
+will not override these values with defaults; to override values you must
+explicitly provide a value corresponding to the default.
Generally speaking only a single call (at most) to .configure()
should
+be required.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
m_probability |
+
+ float
+ |
+
+
+
+ The m probability for this +comparison level. +Default is equivalent to None, in which case a default initial value +will be provided for this level. + |
+
+ unsupplied_option
+ |
+
u_probability |
+
+ float
+ |
+
+
+
+ The u probability for this +comparison level. +Default is equivalent to None, in which case a default initial value +will be provided for this level. + |
+
+ unsupplied_option
+ |
+
tf_adjustment_column |
+
+ str
+ |
+
+
+
+ Make term frequency adjustments for +this comparison level using this input column. +Default is equivalent to None, meaning that term-frequency adjustments +will not be applied for this level. + |
+
+ unsupplied_option
+ |
+
tf_adjustment_weight |
+
+ float
+ |
+
+
+
+ Make term frequency adjustments +for this comparison level using this weight. +Default is equivalent to None, meaning term-frequency adjustments are +fully-weighted if turned on. + |
+
+ unsupplied_option
+ |
+
tf_minimum_u_value |
+
+ float
+ |
+
+
+
+ When term frequency adjustments are +turned on, where the term frequency adjustment implies a u value below +this value, use this minimum value instead. +Defaults is equivalent to None, meaning no minimum value. + |
+
+ unsupplied_option
+ |
+
is_null_level |
+
+ bool
+ |
+
+
+
+ If true, m and u values will not be +estimated and instead the match weight will be zero for this column. +Default is equivalent to False. + |
+
+ unsupplied_option
+ |
+
label_for_charts |
+
+ str
+ |
+
+
+
+ If provided, a custom label that will +be used for this level in any charts. +Default is equivalent to None, in which case a default label will be +provided for this level. + |
+
+ unsupplied_option
+ |
+
disable_tf_exact_match_detection |
+
+ bool
+ |
+
+
+
+ If true, if term +frequency adjustments are set, the corresponding adjustment will be +made using the u-value for this level, rather than the usual case +where it is the u-value of the exact match level in the same comparison. +Default is equivalent to False. + |
+
+ unsupplied_option
+ |
+
fix_m_probability |
+
+ bool
+ |
+
+
+
+ If true, the m probability for this +level will be fixed and not estimated during training. +Default is equivalent to False. + |
+
+ unsupplied_option
+ |
+
fix_u_probability |
+
+ bool
+ |
+
+
+
+ If true, the u probability for this +level will be fixed and not estimated during training. +Default is equivalent to False. + |
+
+ unsupplied_option
+ |
+
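For example, a sketch with purely illustrative values:

import splink.comparison_level_library as cll

level = cll.ExactMatchLevel("email").configure(
    tf_adjustment_column="email",  # apply term frequency adjustments using this column
    u_probability=0.001,           # illustrative fixed starting value
)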
comparison_library
¶AbsoluteTimeDifferenceAtThresholds(col_name, *, input_is_string, metrics, thresholds, datetime_format=None, term_frequency_adjustments=False, invalid_dates_as_null=True)
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with multiple levels based on
+absolute time differences:
col_name
For example, with metrics = ['day', 'month'] and thresholds = [1, 3] the levels +are:
+col_name
col_name
<= 1 daycol_name
<= 3 monthsThis comparison uses the AbsoluteTimeDifferenceLevel, which computes the total +elapsed time between two dates, rather than counting calendar intervals.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare. + |
+ + required + | +
input_is_string |
+
+ bool
+ |
+
+
+
+ If True, the input dates are treated as strings
+and parsed according to |
+ + required + | +
metrics |
+
+ Union[DateMetricType, List[DateMetricType]]
+ |
+
+
+
+ The unit(s) of time +to use when comparing dates. Can be 'second', 'minute', 'hour', 'day', +'month', or 'year'. + |
+ + required + | +
thresholds |
+
+ Union[int, float, List[Union[int, float]]]
+ |
+
+
+
+ The threshold(s) +to use for the time difference level(s). + |
+ + required + | +
datetime_format |
+
+ str
+ |
+
+
+
+ The format string for parsing dates if input_is_string is True.
+ |
+
+ None
+ |
+
term_frequency_adjustments |
+
+ bool
+ |
+
+
+
+ Whether to apply term frequency +adjustments. Defaults to False. + |
+
+ False
+ |
+
invalid_dates_as_null |
+
+ bool
+ |
+
+
+
+ If True and input_is_string is True, invalid dates will be treated as null values. Defaults to True. |
+
+ True
+ |
+
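For example, a sketch comparing a hypothetical due_date column held as ISO 8601 strings:

import splink.comparison_library as cl

date_comparison = cl.AbsoluteTimeDifferenceAtThresholds(
    "due_date",
    input_is_string=True,
    metrics=["day", "month"],
    thresholds=[7, 3],
)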
ArrayIntersectAtSizes(col_name, size_threshold_or_thresholds=[1])
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with multiple levels based on
+the intersection sizes of array elements:
For example, with size_threshold_or_thresholds = [3, 1], the levels are:
+col_name
has at least 3 elementscol_name
has at least 1 elementParameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare. + |
+ + required + | +
size_threshold_or_thresholds |
+
+ Union[int, list[int]]
+ |
+
+
+
+ The +size threshold(s) for the intersection levels. +Defaults to [1]. + |
+
+ [1]
+ |
+
CustomComparison(comparison_levels, output_column_name=None, comparison_description=None)
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data with custom supplied levels.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_column_name |
+
+ str
+ |
+
+
+
+ The column name to use to refer to this comparison + |
+
+ None
+ |
+
comparison_levels |
+
+ list
+ |
+
+
+
+ A list of some combination of ComparisonLevelCreator objects or dicts, representing the levels of the comparison
+ |
+ + required + | +
comparison_description |
+
+ str
+ |
+
+
+
+ An optional description of the +comparison + |
+
+ None
+ |
+
DamerauLevenshteinAtThresholds(col_name, distance_threshold_or_thresholds=[1, 2])
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with three or more levels:
col_name
For example, with distance_threshold_or_thresholds = [1, 3] the levels are
+col_name
col_name
<= 1col_name
<= 3Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare. + |
+ + required + | +
distance_threshold_or_thresholds |
+
+ Union[int, list]
+ |
+
+
+
+ The +threshold(s) to use for the Damerau-Levenshtein similarity level(s). +Defaults to [1, 2]. + |
+
+ [1, 2]
+ |
+
DateOfBirthComparison(col_name, *, input_is_string, datetime_thresholds=[1, 1, 10], datetime_metrics=['month', 'year', 'year'], datetime_format=None, invalid_dates_as_null=True)
+
+¶
+ Bases: ComparisonCreator
Generate an 'out of the box' comparison for a date of birth column
+in the col_name
provided.
Note that input_is_string
is a required argument: you must denote whether the
+col_name
contains values of type date/datetime or string.
The default arguments will give a comparison with comparison levels:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ The column name + |
+ + required + | +
input_is_string |
+
+ bool
+ |
+
+
+
+ If True, the provided column is treated as a string and parsed to dates; if False, it must already be of type date/datetime. |
+ + required + | +
datetime_thresholds |
+
+ Union[int, float, List[Union[int, float]]]
+ |
+
+
+
+ Numeric thresholds for date differences. Defaults to [1, 1, 10]. + |
+
+ [1, 1, 10]
+ |
+
datetime_metrics |
+
+ Union[DateMetricType, List[DateMetricType]]
+ |
+
+
+
+ Metrics for date differences. Defaults to ["month", "year", "year"]. + |
+
+ ['month', 'year', 'year']
+ |
+
datetime_format |
+
+ str
+ |
+
+
+
+ The datetime format used to cast strings +to dates. Only used if input is a string. + |
+
+ None
+ |
+
invalid_dates_as_null |
+
+ bool
+ |
+
+
+
+ If True, treat invalid dates as null +as opposed to allowing e.g. an exact or levenshtein match where one side +or both are an invalid date. Only used if input is a string. Defaults +to True. + |
+
+ True
+ |
+
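For example, a minimal sketch for a string date-of-birth column:

import splink.comparison_library as cl

dob_comparison = cl.DateOfBirthComparison("date_of_birth", input_is_string=True)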
DistanceFunctionAtThresholds(col_name, distance_function_name, distance_threshold_or_thresholds, higher_is_more_similar=True)
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with three or more levels:
col_name
For example, with distance_threshold_or_thresholds = [1, 3] +and distance_function 'hamming', with higher_is_more_similar False +the levels are:
+col_name
col_name
<= 1col_name
<= 3Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare. + |
+ + required + | +
distance_function_name |
+
+ str
+ |
+
+
+
+ the name of the SQL distance function + |
+ + required + | +
distance_threshold_or_thresholds |
+
+ Union[float, list]
+ |
+
+
+
+ The +threshold(s) to use for the distance function level(s). + |
+ + required + | +
higher_is_more_similar |
+
+ bool
+ |
+
+
+
+ Are higher values of the distance function +more similar? (e.g. True for Jaro-Winkler, False for Levenshtein) +Default is True + |
+
+ True
+ |
+
DistanceInKMAtThresholds(lat_col, long_col, km_thresholds)
+
+¶
+ Bases: ComparisonCreator
A comparison of the latitude, longitude coordinates defined in 'lat_col' and 'long_col', giving the great circle distance between them in km.
+An example of the output with km_thresholds = [1, 10] would be:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
lat_col |
+
+ str
+ |
+
+
+
+ The name of the latitude column to compare. + |
+ + required + | +
long_col |
+
+ str
+ |
+
+
+
+ The name of the longitude column to compare. + |
+ + required + | +
km_thresholds |
+
+ iterable[float] | float
+ |
+
+
+
+ The km threshold(s) for the +distance levels. + |
+ + required + | +
EmailComparison(col_name)
+
+¶
+ Bases: ComparisonCreator
Generate an 'out of the box' comparison for an email address column
+in the col_name
provided.
The default comparison levels are:
+john@smith.com
vs. john@smith.com
.john@company.com
vs.
+john@other.com
.john.smith@company.com
+vs. john.smyth@company.com
.john.smith@company.com
vs. john.smyth@other.com
.john@company.com
vs. rebecca@other.com
.Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ The column name or expression for +the email addresses to be compared. + |
+ + required + | +
ExactMatch(col_name)
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with two levels:
col_name
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare + |
+ + required + | +
ForenameSurnameComparison(forename_col_name, surname_col_name, *, jaro_winkler_thresholds=[0.92, 0.88], forename_surname_concat_col_name=None)
+
+¶
+ Bases: ComparisonCreator
Generate an 'out of the box' comparison for forename and surname columns
+in the forename_col_name
and surname_col_name
provided.
It's recommended to derive an additional column containing a concatenated
+forename and surname column so that term frequencies can be applied to the
+full name. If you have derived a column, provide it at
+forename_surname_concat_col_name
.
The default comparison levels are:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
forename_col_name |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ The column name or +expression for the forenames to be compared. + |
+ + required + | +
surname_col_name |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ The column name or +expression for the surnames to be compared. + |
+ + required + | +
jaro_winkler_thresholds |
+
+ Union[float, list[float]]
+ |
+
+
+
+ Thresholds +for Jaro-Winkler similarity. Defaults to [0.92, 0.88]. + |
+
+ [0.92, 0.88]
+ |
+
forename_surname_concat_col_name |
+
+ str
+ |
+
+
+
+ The column name for +concatenated forename and surname values. If provided, term +frequencies are applied on the exact match using this column + |
+
+ None
+ |
+
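For example, a sketch assuming a derived concatenated name column exists in the input data:

import splink.comparison_library as cl

name_comparison = cl.ForenameSurnameComparison(
    "first_name",
    "surname",
    forename_surname_concat_col_name="first_name_surname_concat",
)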
JaccardAtThresholds(col_name, score_threshold_or_thresholds=[0.9, 0.7])
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with three or more levels:
col_name
For example, with score_threshold_or_thresholds = [0.9, 0.7] the levels are:
+col_name
col_name
>= 0.9col_name
>= 0.7Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare. + |
+ + required + | +
score_threshold_or_thresholds |
+
+ Union[float, list]
+ |
+
+
+
+ The +threshold(s) to use for the Jaccard similarity level(s). +Defaults to [0.9, 0.7]. + |
+
+ [0.9, 0.7]
+ |
+
JaroAtThresholds(col_name, score_threshold_or_thresholds=[0.9, 0.7])
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with three or more levels:
col_name
For example, with score_threshold_or_thresholds = [0.9, 0.7] the levels are:
+col_name
col_name
>= 0.9col_name
>= 0.7Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare. + |
+ + required + | +
score_threshold_or_thresholds |
+
+ Union[float, list]
+ |
+
+
+
+ The +threshold(s) to use for the Jaro similarity level(s). +Defaults to [0.9, 0.7]. + |
+
+ [0.9, 0.7]
+ |
+
JaroWinklerAtThresholds(col_name, score_threshold_or_thresholds=[0.9, 0.7])
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with three or more levels:
col_name
For example, with score_threshold_or_thresholds = [0.9, 0.7] the levels are:
+col_name
col_name
>= 0.9col_name
>= 0.7Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare. + |
+ + required + | +
score_threshold_or_thresholds |
+
+ Union[float, list]
+ |
+
+
+
+ The +threshold(s) to use for the Jaro-Winkler similarity level(s). +Defaults to [0.9, 0.7]. + |
+
+ [0.9, 0.7]
+ |
+
LevenshteinAtThresholds(col_name, distance_threshold_or_thresholds=[1, 2])
+
+¶
+ Bases: ComparisonCreator
Represents a comparison of the data in col_name
with three or more levels:
col_name
For example, with distance_threshold_or_thresholds = [1, 3] the levels are
+col_name
col_name
<= 1col_name
<= 3Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ str
+ |
+
+
+
+ The name of the column to compare + |
+ + required + | +
distance_threshold_or_thresholds |
+
+ Union[int, list]
+ |
+
+
+
+ The +threshold(s) to use for the levenshtein similarity level(s). +Defaults to [1, 2]. + |
+
+ [1, 2]
+ |
+
NameComparison(col_name, *, jaro_winkler_thresholds=[0.92, 0.88, 0.7], dmeta_col_name=None)
+
+¶
+ Bases: ComparisonCreator
Generate an 'out of the box' comparison for a name column in the col_name
+provided.
It's also possible to include a level for a dmetaphone match, but this requires +you to derive a dmetaphone column prior to importing it into Splink. Note +this is expected to be a column containing arrays of dmetaphone values, which +are of length 1 or 2.
+The default comparison levels are:
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ The column name or expression for +the names to be compared. + |
+ + required + | +
jaro_winkler_thresholds |
+
+ Union[float, list[float]]
+ |
+
+
+
+ Thresholds +for Jaro-Winkler similarity. Defaults to [0.92, 0.88, 0.7]. + |
+
+ [0.92, 0.88, 0.7]
+ |
+
dmeta_col_name |
+
+ str
+ |
+
+
+
+ The column name for dmetaphone values. +If provided, array intersection level is included. This column must +contain arrays of dmetaphone values, which are of length 1 or 2. + |
+
+ None
+ |
+
PostcodeComparison(col_name, *, invalid_postcodes_as_null=False, lat_col=None, long_col=None, km_thresholds=[1, 10, 100])
+
+¶
+ Bases: ComparisonCreator
Generate an 'out of the box' comparison for a postcode column
+in the col_name
provided.
The default comparison levels are:
+It's also possible to include levels for distance in km, but this requires
+you to have geocoded your postcodes prior to importing them into Splink. Use
+the lat_col
and long_col
arguments to tell Splink where to find the
+latitude and longitude columns.
See https://ideal-postcodes.co.uk/guides/uk-postcode-format +for definitions
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col_name |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ The column name or expression for +the postcodes to be compared. + |
+ + required + | +
invalid_postcodes_as_null |
+
+ bool
+ |
+
+
+
+ If True, treat invalid postcodes +as null. Defaults to False. + |
+
+ False
+ |
+
lat_col |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ The column name or
+expression for latitude. Required if km_thresholds is provided. |
+
+ None
+ |
+
long_col |
+
+ Union[str, ColumnExpression]
+ |
+
+
+
+ The column name or
+expression for longitude. Required if km_thresholds is provided. |
+
+ None
+ |
+
km_thresholds |
+
+ Union[float, List[float]]
+ |
+
+
+
+ Thresholds for distance
+in kilometers. If provided, lat_col and long_col must also be provided. Defaults to [1, 10, 100]. |
+
+ [1, 10, 100]
+ |
+
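For example, a sketch assuming geocoded lat and long columns exist in the input data:

import splink.comparison_library as cl

postcode_comparison = cl.PostcodeComparison(
    "postcode", lat_col="lat", long_col="long", km_thresholds=[1, 10]
)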
An alias of AbsoluteTimeDifferenceAtThresholds.
+Note that all comparisons have a .configure()
method as follows:
Configure the comparison creator with options that are common to all +comparisons.
+For m and u probabilities, the first element in the list corresponds to the first comparison level, usually an exact match level. Subsequent elements correspond to comparison levels in sequential order, through to the last element, which is usually the 'ELSE' level.
+All options have default options set initially. Any call to .configure()
+will set any options that are supplied. Any subsequent calls to .configure()
+will not override these values with defaults; to override values you must
+explicitly provide a value corresponding to the default.
Generally speaking only a single call (at most) to .configure()
should
+be required.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
term_frequency_adjustments |
+
+ bool
+ |
+
+
+
+ Whether term frequency +adjustments are switched on for this comparison. Only applied +to exact match levels. +Default corresponds to False. + |
+
+ unsupplied_option
+ |
+
m_probabilities |
+
+ list
+ |
+
+
+
+ List of m probabilities +Default corresponds to None. + |
+
+ unsupplied_option
+ |
+
u_probabilities |
+
+ list
+ |
+
+
+
+ List of u probabilities +Default corresponds to None. + |
+
+ unsupplied_option
+ |
+
cc = LevenshteinAtThresholds("name", 2)
+cc.configure(
+ m_probabilities=[0.9, 0.08, 0.02],
+ u_probabilities=[0.01, 0.05, 0.94]
+ # probabilities for exact match level, levenshtein <= 2, and else
+ # in that order
+)
+
Splink has some datasets available for use to help you get up and running, test ideas, or explore Splink features.
+To use, simply import splink_datasets
:
+
from splink import splink_datasets
+
+df = splink_datasets.fake_1000
+
from splink import splink_datasets, Linker, DuckDBAPI, SettingsCreator
+import splink.comparison_library as cl
+
+df = splink_datasets.fake_1000
+linker = Linker(
+ df,
+ SettingsCreator(
+ link_type="dedupe_only",
+ comparisons=[
+ cl.ExactMatch("first_name"),
+ cl.ExactMatch("surname"),
+ ],
+ ),
+ db_api=DuckDBAPI()
+)
+
If you get an SSLCertVerificationError
when trying to use the inbuilt datasets, this can be fixed with the ssl
package by running:
import ssl; ssl._create_default_https_context = ssl._create_unverified_context
.
splink_datasets
¶Each attribute of splink_datasets
is a dataset available for use, which exists as a pandas DataFrame
.
+These datasets are not packaged directly with Splink, but instead are downloaded only when they are requested.
+Once requested they are cached for future use.
+The cache can be cleared using splink_dataset_utils
(see below),
+which also contains information on available datasets, and which have already been cached.
The datasets available are listed below:
+dataset name | +description | +rows | +unique entities | +link to source | +
---|---|---|---|---|
fake_1000 |
+Fake 1000 from splink demos. Records are 250 simulated people, with different numbers of duplicates, labelled. | +1,000 | +250 | +source | +
historical_50k |
+The data is based on historical persons scraped from wikidata. Duplicate records are introduced with a variety of errors. | +50,000 | +5,156 | +source | +
febrl3 |
+The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany.FEBRL3 data set contains 5000 records (2000 originals and 3000 duplicates), with a maximum of 5 duplicates based on one original record. | +5,000 | +2,000 | +source | +
febrl4a |
+The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany.FEBRL4a contains 5000 original records. | +5,000 | +5,000 | +source | +
febrl4b |
+The Freely Extensible Biomedical Record Linkage (FEBRL) datasets consist of comparison patterns from an epidemiological cancer study in Germany.FEBRL4b contains 5000 duplicate records, one for each record in FEBRL4a. | +5,000 | +5,000 | +source | +
transactions_origin |
+This data has been generated to resemble bank transactions leaving an account. There are no duplicates within the dataset and each transaction is designed to have a counterpart arriving in 'transactions_destination'. Memo is sometimes truncated or missing. | +45,326 | +45,326 | +source | +
transactions_destination |
+This data has been generated to resemble bank transactions arriving in an account. There are no duplicates within the dataset and each transaction is designed to have a counterpart sent from 'transactions_origin'. There may be a delay between the source and destination account, and the amount may vary due to hidden fees and foreign exchange rates. Memo is sometimes truncated or missing. | +45,326 | +45,326 | +source | +
splink_dataset_labels
¶Some of the splink_datasets
have corresponding clerical labels to help assess model performance. These are requested through the splink_dataset_labels
module.
The datasets available are listed below:
+dataset name | +description | +rows | +unique entities | +link to source | +
---|---|---|---|---|
fake_1000_labels |
+Clerical labels for fake_1000 | +3,176 | +NA | +source | +
splink_dataset_utils
API¶In addition to splink_datasets
, you can also import splink_dataset_utils
,
+which has a few functions to help managing splink_datasets
.
+This can be useful if you have limited internet connection and want to see what is already cached,
+or if you need to clear cache items (e.g. if datasets were to be updated, or if space is an issue).
For example: +
from splink.datasets import splink_dataset_utils
+
+splink_dataset_utils.show_downloaded_data()
+splink_dataset_utils.clear_cache(['fake_1000'])
+
list_downloaded_datasets()
+
+¶Return a list of datasets that have already been pre-downloaded
+ +list_all_datasets()
+
+¶Return a list of all available datasets, regardless of whether +or not they have already been pre-downloaded
+ +show_downloaded_data()
+
+¶Print a list of datasets that have already been pre-downloaded
+ +clear_downloaded_data(datasets=None)
+
+¶Delete any pre-downloaded data stored locally.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
datasets |
+
+ list
+ |
+
+
+
+ A list of dataset names (without any file suffix)
+to delete.
+If |
+
+ None
+ |
+
EMTrainingSession
¶linker.training.estimate_parameters_using_expectation_maximisation
returns an object of type EMTrainingSession
which has the following methods:
Manages training models using the Expectation Maximisation algorithm, and +holds statistics on the evolution of parameter estimates. Plots diagnostic charts
+ + + + +probability_two_random_records_match_iteration_chart()
+
+¶Display a chart showing the iteration history of the probability that two +random records match.
+ + +Returns:
+Type | +Description | +
---|---|
+ ChartReturnType
+ |
+
+
+
+ An interactive Altair chart. + |
+
match_weights_interactive_history_chart()
+
+¶Display an interactive chart of the match weights history.
+ + +Returns:
+Type | +Description | +
---|---|
+ ChartReturnType
+ |
+
+
+
+ An interactive Altair chart. + |
+
m_u_values_interactive_history_chart()
+
+¶Display an interactive chart of the m and u values.
+ + +Returns:
+Type | +Description | +
---|---|
+ ChartReturnType
+ |
+
+
+
+ An interactive Altair chart. + |
+
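For example, a sketch assuming an initialised linker:

from splink import block_on

training_session = linker.training.estimate_parameters_using_expectation_maximisation(
    block_on("first_name", "surname")
)
training_session.match_weights_interactive_history_chart()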
Evaluate the performance of a Splink model. Accessed via
+linker.evaluation
prediction_errors_from_labels_table(labels_splinkdataframe_or_table_name, include_false_positives=True, include_false_negatives=True, threshold_match_probability=0.5)
+
+¶Find false positives and false negatives based on the comparison between the
+clerical_match_score
in the labels table compared with the splink predicted
+match probability
The table of labels should be in the following format, and should be registered +as a table with your database using
+labels_table = linker.table_management.register_labels_table(my_df)
source_dataset_l | +unique_id_l | +source_dataset_r | +unique_id_r | +clerical_match_score | +
---|---|---|---|---|
df_1 | +1 | +df_2 | +2 | +0.99 | +
df_1 | +1 | +df_2 | +3 | +0.2 | +
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
labels_splinkdataframe_or_table_name |
+
+ str | SplinkDataFrame
+ |
+
+
+
+ Name of table +containing labels in the database + |
+ + required + | +
include_false_positives |
+
+ bool
+ |
+
+
+
+ Defaults to True. + |
+
+ True
+ |
+
include_false_negatives |
+
+ bool
+ |
+
+
+
+ Defaults to True. + |
+
+ True
+ |
+
threshold_match_probability |
+
+ float
+ |
+
+
+
+ Threshold probability +above which a prediction considered to be a match. Defaults to 0.5. + |
+
+ 0.5
+ |
+
Examples:
+labels_table = linker.table_management.register_labels_table(df_labels)
+
+linker.evaluation.prediction_errors_from_labels_table(
+ labels_table, include_false_negatives=True, include_false_positives=False
+).as_pandas_dataframe()
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ Table containing false positives and negatives + |
+
accuracy_analysis_from_labels_column(labels_column_name, *, threshold_match_probability=0.5, match_weight_round_to_nearest=0.1, output_type='threshold_selection', add_metrics=[], positives_not_captured_by_blocking_rules_scored_as_zero=True)
+
+¶Generate an accuracy chart or table from ground truth data, where the ground
+truth is in a column in the input dataset called labels_column_name
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
labels_column_name |
+
+ str
+ |
+
+
+
+ Column name containing labels in the input table + |
+ + required + | +
threshold_match_probability |
+
+ float
+ |
+
+
+
+ Where the
+ |
+
+ 0.5
+ |
+
match_weight_round_to_nearest |
+
+ float
+ |
+
+
+
+ When provided, thresholds are rounded. When large numbers of labels are provided, this is sometimes necessary to reduce the size of the ROC table, and therefore the number of points plotted on the chart. Defaults to 0.1. |
+
+ 0.1
+ |
+
add_metrics |
+
+ list(str)
+ |
+
+
+
+ Precision and recall metrics are always
+included. Where provided,
|
+
+ []
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
chart |
+ Union[ChartReturnType, SplinkDataFrame]
+ |
+
+
+
+ An altair chart + |
+
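For example, a sketch assuming the input data contains a ground truth column named 'cluster':

linker.evaluation.accuracy_analysis_from_labels_column(
    "cluster",
    output_type="roc",
    add_metrics=["f1"],
)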
accuracy_analysis_from_labels_table(labels_splinkdataframe_or_table_name, *, threshold_match_probability=0.5, match_weight_round_to_nearest=0.1, output_type='threshold_selection', add_metrics=[])
+
+¶Generate an accuracy chart or table from labelled (ground truth) data.
+The table of labels should be in the following format, and should be registered
+as a table with your database using
+labels_table = linker.register_labels_table(my_df)
source_dataset_l | +unique_id_l | +source_dataset_r | +unique_id_r | +clerical_match_score | +
---|---|---|---|---|
df_1 | +1 | +df_2 | +2 | +0.99 | +
df_1 | +1 | +df_2 | +3 | +0.2 | +
Note that source_dataset
and unique_id
should correspond to the values
+specified in the settings dict, and the input_table_aliases
passed to the
+linker
object.
For dedupe_only
links, the source_dataset
columns can be omitted.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
labels_splinkdataframe_or_table_name |
+
+ str | SplinkDataFrame
+ |
+
+
+
+ Name of table +containing labels in the database + |
+ + required + | +
threshold_match_probability |
+
+ float
+ |
+
+
+
+ Where the
+ |
+
+ 0.5
+ |
+
match_weight_round_to_nearest |
+
+ float
+ |
+
+
+
+ When provided, thresholds are rounded. When large numbers of labels are provided, this is sometimes necessary to reduce the size of the ROC table, and therefore the number of points plotted on the chart. Defaults to 0.1. |
+
+ 0.1
+ |
+
add_metrics |
+
+ list(str)
+ |
+
+
+
+ Precision and recall metrics are always
+included. Where provided,
|
+
+ []
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Union[ChartReturnType, SplinkDataFrame]
+ |
+
+
+
+ altair.Chart: An altair chart + |
+
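For example, a sketch assuming a labels dataframe df_labels in the format above:

labels_table = linker.table_management.register_labels_table(df_labels)
linker.evaluation.accuracy_analysis_from_labels_table(labels_table, output_type="roc")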
prediction_errors_from_labels_column(label_colname, include_false_positives=True, include_false_negatives=True, threshold_match_probability=0.5)
+
+¶Generate a dataframe containing false positives and false negatives +based on the comparison between the splink match probability and the +labels column. A label column is a column in the input dataset that contains +the 'ground truth' cluster to which the record belongs
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
label_colname |
+
+ str
+ |
+
+
+
+ Name of labels column in input data + |
+ + required + | +
include_false_positives |
+
+ bool
+ |
+
+
+
+ Defaults to True. + |
+
+ True
+ |
+
include_false_negatives |
+
+ bool
+ |
+
+
+
+ Defaults to True. + |
+
+ True
+ |
+
threshold_match_probability |
+
+ float
+ |
+
+
+
+ Threshold above which a score +is considered to be a match. Defaults to 0.5. + |
+
+ 0.5
+ |
+
Examples:
+linker.evaluation.prediction_errors_from_labels_column(
+ "ground_truth_cluster",
+ include_false_negatives=True,
+ include_false_positives=False
+).as_pandas_dataframe()
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ Table containing false positives and negatives + |
+
unlinkables_chart(x_col='match_weight', name_of_data_in_title=None, as_dict=False)
+
+¶Generate an interactive chart displaying the proportion of records that +are "unlinkable" for a given splink score threshold and model parameters.
+Unlinkable records are those that, even when compared with themselves, do not +contain enough information to confirm a match.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
x_col |
+
+ str
+ |
+
+
+
+ Column to use for the x-axis. +Defaults to "match_weight". + |
+
+ 'match_weight'
+ |
+
name_of_data_in_title |
+
+ str
+ |
+
+
+
+ Name of the source dataset to use for +the title of the output chart. + |
+
+ None
+ |
+
as_dict |
+
+ bool
+ |
+
+
+
+ If True, return a dict version of the chart. + |
+
+ False
+ |
+
Examples:
+After estimating the parameters of the model, run:
+linker.evaluation.unlinkables_chart()
+
Returns:
+Type | +Description | +
---|---|
+ ChartReturnType
+ |
+
+
+
+ altair.Chart: An altair chart + |
+
labelling_tool_for_specific_record(unique_id, source_dataset=None, out_path='labelling_tool.html', overwrite=False, match_weight_threshold=-4, view_in_jupyter=False, show_splink_predictions_in_interface=True)
+
+¶Create a standalone, offline labelling dashboard for a specific record +as identified by its unique id
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
unique_id |
+
+ str
+ |
+
+
+
+ The unique id of the record for which to create the +labelling tool + |
+ + required + | +
source_dataset |
+
+ str
+ |
+
+
+
+ If there are multiple datasets, to +identify the record you must also specify the source_dataset. Defaults +to None. + |
+
+ None
+ |
+
out_path |
+
+ str
+ |
+
+
+
+ The output path for the labelling tool. Defaults +to "labelling_tool.html". + |
+
+ 'labelling_tool.html'
+ |
+
overwrite |
+
+ bool
+ |
+
+
+
+ If true, overwrite files at the output +path if they exist. Defaults to False. + |
+
+ False
+ |
+
match_weight_threshold |
+
+ int
+ |
+
+
+
+ Include possible matches in the +output which score above this threshold. Defaults to -4. + |
+
+ -4
+ |
+
view_in_jupyter |
+
+ bool
+ |
+
+
+
+ If you're viewing in the Jupyter +html viewer, set this to True to extract your labels. Defaults to False. + |
+
+ False
+ |
+
show_splink_predictions_in_interface |
+
+ bool
+ |
+
+
+
+ Whether to +show information about the Splink model's predictions that could +potentially bias the decision of the clerical labeller. Defaults to +True. + |
+
+ True
+ |
+
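+For example, assuming this method is accessed via linker.evaluation like the other methods in this section (the unique id shown is hypothetical):
+```py
+linker.evaluation.labelling_tool_for_specific_record(
+    unique_id="id-123",  # must exist in the input data
+    out_path="labelling_tool.html",
+    overwrite=True,
+)
+```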
splink.exploratory
¶completeness_chart(table_or_tables, db_api, cols=None, table_names_for_chart=None)
+
+¶Generate a summary chart of data completeness (proportion of non-nulls) of +columns in each of the input table or tables. By default, completeness is assessed +for all columns in the input data.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
table_or_tables |
+
+ Sequence[AcceptableInputTableType]
+ |
+
+
+
+ A single table or a list of tables of data + |
+ + required + | +
db_api |
+
+ DatabaseAPISubClass
+ |
+
+
+
+ The backend database API to use + |
+ + required + | +
cols |
+
+ List[str]
+ |
+
+
+
+ List of column names for which to calculate completeness. If +None, completeness is calculated for all columns. Defaults to None. + |
+
+ None
+ |
+
table_names_for_chart |
+
+ List[str]
+ |
+
+
+
+ A list of names. Must be the same length as +table_or_tables. + |
+
+ None
+ |
+
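+A minimal sketch using the DuckDB backend and one of the built-in example datasets:
+```py
+from splink import DuckDBAPI, splink_datasets
+from splink.exploratory import completeness_chart
+
+df = splink_datasets.fake_1000
+chart = completeness_chart(df, db_api=DuckDBAPI())
+```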
profile_columns(table_or_tables, db_api, column_expressions=None, top_n=10, bottom_n=10)
+
+¶Profiles the specified columns of the input table or tables.
+This can be computationally expensive if the input data is large.
+For the provided column_expressions (or all columns if left empty), calculate:
+- A distribution plot showing the count of values at each percentile.
+- A top n chart showing the count of the top n values within the column.
+- A bottom n chart showing the count of the bottom n values within the column.
+This should be used to explore the dataframe, determine if columns have +sufficient completeness for linking, analyse the cardinality of columns, and +identify the need for standardisation within a given column.
+Args:
+column_expressions (list, optional): A list of strings containing the
+ specified column names.
+ If left empty this will default to all columns.
+top_n (int, optional): The number of top n values to plot.
+bottom_n (int, optional): The number of bottom n values to plot.
+
+
+
+ Returns:
+Type | +Description | +
---|---|
+ Optional[ChartReturnType]
+ |
+
+
+
+ altair.Chart or dict: A visualization or JSON specification describing the + |
+
+ Optional[ChartReturnType]
+ |
+
+
+
+ profiling charts. + |
+
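+A minimal sketch using the DuckDB backend (the column expressions are illustrative):
+```py
+from splink import DuckDBAPI, splink_datasets
+from splink.exploratory import profile_columns
+
+df = splink_datasets.fake_1000
+profile_columns(
+    df,
+    db_api=DuckDBAPI(),
+    column_expressions=["first_name", "substr(dob, 1, 4)"],
+)
+```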
Note that column_expressions can be a list of column names or SQL expressions to profile; if left empty, all columns will be profiled. The top_n and bottom_n parameters determine the number of top and bottom values to display in the respective charts.
splink.exploratory.similarity_analysis
¶comparator_score(str1, str2, decimal_places=2)
+
+¶Helper function to give the similarity between two strings for +the string comparators in Splink.
+ + +Examples:
+import splink.exploratory.similarity_analysis as sa
+
+sa.comparator_score("Richard", "iRchard")
+
comparator_score_chart(list, col1, col2)
+
+¶Helper function returning a heatmap showing the string similarity +scores and string distances for a list of strings.
+ + +Examples:
+import splink.exploratory.similarity_analysis as sa
+
+data = {
+ "string1": ["Stephen", "Stephen", "Stephen"],
+ "string2": ["Stephen", "Steven", "Stephan"],
+ }
+
+sa.comparator_score_chart(data, "string1", "string2")
+
comparator_score_df(list, col1, col2, decimal_places=2)
+
+¶Helper function returning a dataframe showing the string similarity +scores and string distances for a list of strings.
+ + +Examples:
+import splink.exploratory.similarity_analysis as sa
+
+data = {
+ "string1": ["Stephen", "Stephen", "Stephen"],
+ "string2": ["Stephen", "Steven", "Stephan"],
+ }
+
+sa.comparator_score_df(data, "string1", "string2")
+
comparator_score_threshold_chart(list, col1, col2, similarity_threshold=None, distance_threshold=None)
+
+¶Helper function returning a heatmap showing the string similarity +scores and string distances for a list of strings given a threshold.
+ + +Examples:
+import splink.exploratory.similarity_analysis as sa
+
+data = {
+ "string1": ["Stephen", "Stephen","Stephen"],
+ "string2": ["Stephen", "Steven", "Stephan"],
+ }
+
+sa.comparator_score_threshold_chart(data,
+ "string1", "string2",
+ similarity_threshold=0.8,
+ distance_threshold=2)
+
phonetic_match_chart(list, col1, col2)
+
+¶Helper function returning a heatmap showing the phonetic transforms and +matches for a list of strings.
+ + +Examples:
+import splink.exploratory.similarity_analysis as sa
+
+data = {
+ "string1": ["Stephen", "Stephen","Stephen"],
+ "string2": ["Stephen", "Steven", "Stephan"],
+ }
+
+sa.phonetic_match_chart(data, "string1", "string2")
+
phonetic_transform(string)
+
+¶Helper function to give the phonetic transformation of a string with +Soundex, Metaphone and Double Metaphone.
+ + +Examples:
+phonetic_transform("Richard", "iRchard")
+
phonetic_transform_df(list, col1, col2)
+
+¶Helper function returning a dataframe showing the phonetic transforms +for a list of strings.
+ + +Examples:
+import splink.exploratory.similarity_analysis as sa
+
+data = {
+ "string1": ["Stephen", "Stephen","Stephen"],
+ "string2": ["Stephen", "Steven", "Stephan"],
+ }
+
+sa.phonetic_transform_df(data, "string1", "string2")
+
Use your Splink model to make predictions (perform inference). Accessed via
+linker.inference
.
deterministic_link()
+
+¶Uses the blocking rules specified by
+blocking_rules_to_generate_predictions
in your settings to
+generate pairwise record comparisons.
For deterministic linkage, this should be a list of blocking rules which +are strict enough to generate only true links.
+Deterministic linkage, however, is likely to result in missed links +(false negatives).
+Examples:
+```py
+settings = SettingsCreator(
+ link_type="dedupe_only",
+ blocking_rules_to_generate_predictions=[
+ block_on("first_name", "surname"),
+ block_on("dob", "first_name"),
+ ],
+)
+
+linker = Linker(df, settings, db_api=db_api)
+splink_df = linker.inference.deterministic_link()
+```
+
+
+
+ Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ A SplinkDataFrame of the pairwise comparisons. + |
+
predict(threshold_match_probability=None, threshold_match_weight=None, materialise_after_computing_term_frequencies=True, materialise_blocked_pairs=True)
+
+¶Create a dataframe of scored pairwise comparisons using the parameters +of the linkage model.
+Uses the blocking rules specified in the
+blocking_rules_to_generate_predictions
key of the settings to
+generate the pairwise comparisons.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
threshold_match_probability |
+
+ float
+ |
+
+
+
+ If specified, +filter the results to include only pairwise comparisons with a +match_probability above this threshold. Defaults to None. + |
+
+ None
+ |
+
threshold_match_weight |
+
+ float
+ |
+
+
+
+ If specified, +filter the results to include only pairwise comparisons with a +match_weight above this threshold. Defaults to None. + |
+
+ None
+ |
+
materialise_after_computing_term_frequencies |
+
+ bool
+ |
+
+
+
+ If true, Splink +will materialise the table containing the input nodes (rows) +joined to any term frequencies which have been asked +for in the settings object. If False, this will be +computed as part of a large CTE pipeline. Defaults to True + |
+
+ True
+ |
+
materialise_blocked_pairs |
+
+ bool
+ |
+
+
+
+ In the blocking phase, materialise the table +of pairs of records that will be scored + |
+
+ True
+ |
+
Examples:
+linker = Linker(df, "saved_settings.json", db_api=db_api)
+splink_df = linker.inference.predict(threshold_match_probability=0.95)
+splink_df.as_pandas_dataframe(limit=5)
+
find_matches_to_new_records(records_or_tablename, blocking_rules=[], match_weight_threshold=-4)
+
+¶Given one or more records, find records in the input dataset(s) which match +and return in order of the Splink prediction score.
+This effectively provides a way of searching the input datasets +for given record(s)
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
records_or_tablename |
+
+ List[dict]
+ |
+
+
+
+ Input search record(s) as list of dict, +or a table registered to the database. + |
+ + required + | +
blocking_rules |
+
+ list
+ |
+
+
+
+ Blocking rules to select +which records to find and score. If [], do not use a blocking +rule - meaning the input records will be compared to all records +provided to the linker when it was instantiated. Defaults to []. + |
+
+ []
+ |
+
match_weight_threshold |
+
+ int
+ |
+
+
+
+ Return matches with a match weight +above this threshold. Defaults to -4. + |
+
+ -4
+ |
+
Examples:
+linker = Linker(df, "saved_settings.json", db_api=db_api)
+
+# You should load or pre-compute tf tables for any tables with
+# term frequency adjustments
+linker.table_management.compute_tf_table("first_name")
+# OR
+linker.table_management.register_term_frequency_lookup(df, "first_name")
+
+record = {'unique_id': 1,
+ 'first_name': "John",
+ 'surname': "Smith",
+ 'dob': "1971-05-24",
+ 'city': "London",
+ 'email': "john@smith.net"
+ }
+df = linker.inference.find_matches_to_new_records(
+ [record], blocking_rules=[]
+)
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ The pairwise comparisons. + |
+
compare_two_records(record_1, record_2)
+
+¶Use the linkage model to compare and score a pairwise record comparison +based on the two input records provided
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
record_1 |
+
+ dict
+ |
+
+
+
+ dictionary representing the first record. Column names +and data types must be the same as the columns in the settings object + |
+ + required + | +
record_2 |
+
+ dict
+ |
+
+
+
+ dictionary representing the second record. Column names +and data types must be the same as the columns in the settings object + |
+ + required + | +
Examples:
+linker = Linker(df, "saved_settings.json", db_api=db_api)
+
+# You should load or pre-compute tf tables for any tables with
+# term frequency adjustments
+linker.table_management.compute_tf_table("first_name")
+# OR
+linker.table_management.register_term_frequency_lookup(df, "first_name")
+
+record_1 = {'unique_id': 1,
+ 'first_name': "John",
+ 'surname': "Smith",
+ 'dob': "1971-05-24",
+ 'city': "London",
+ 'email': "john@smith.net"
+ }
+
+record_2 = {'unique_id': 1,
+ 'first_name': "Jon",
+ 'surname': "Smith",
+ 'dob': "1971-05-23",
+ 'city': "London",
+ 'email': "john@smith.net"
+ }
+df = linker.inference.compare_two_records(record_1, record_2)
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ Pairwise comparison with scored prediction + |
+
Miscellaneous methods on the linker that don't fit into other categories.
+Accessed via linker.misc
.
save_model_to_json(out_path=None, overwrite=False)
+
+¶Save the configuration and parameters of the linkage model to a .json
file.
The model can later be loaded into a new linker using +`Linker(df, settings="path/to/model.json", db_api=db_api)`.
+The settings dict is also returned in case you want to save it a different way.
+ + +Examples:
+linker.misc.save_model_to_json("my_settings.json", overwrite=True)
+
Returns:
+Name | Type | +Description | +
---|---|---|
dict |
+ dict[str, Any]
+ |
+
+
+
+ The settings as a dictionary. + |
+
query_sql(sql, output_type='pandas')
+
+¶Run a SQL query against your backend database and return +the resulting output.
+ + +Examples:
+linker = Linker(df, settings, db_api)
+df_predict = linker.inference.predict()
+linker.misc.query_sql(f"select * from {df_predict.physical_name} limit 10")
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
sql |
+
+ str
+ |
+
+
+
+ The SQL to be queried. + |
+ + required + | +
output_type |
+
+ str
+ |
+
+
+
+ One of splink_df/splinkdf or pandas. +This determines the type of table that your results are output in. + |
+
+ 'pandas'
+ |
+
This document enumerates all the settings and configuration options available when +developing your data linkage model.
+link_type
¶The type of data linking task. Required.
When dedupe_only
, splink
finds duplicates within a single dataset. The user is expected to provide a single input dataset.
When link_and_dedupe
, splink
finds links within and between input datasets. The user is expected to provide two or more input datasets.
When link_only
, splink
finds links between datasets, but does not attempt to deduplicate the datasets (it does not try to find links within each input dataset). The user is expected to provide two or more input datasets.
Examples: ['dedupe_only', 'link_only', 'link_and_dedupe']
probability_two_random_records_match
¶The probability that two records chosen at random (with no blocking) are a match. For example, if there are a million input records and each has on average one match, then this value should be 1/1,000,000.
+If you estimate parameters using expectation maximisation (EM), this provides an initial value (prior) from which the EM algorithm will start iterating. EM will then estimate the true value of this parameter.
+Default value: 0.0001
Examples: [1e-05, 0.006]
em_convergence
¶Convergence tolerance for the Expectation Maximisation algorithm
+The algorithm will stop converging when the maximum of the change in model parameters between iterations is below this value
+Default value: 0.0001
Examples: [0.0001, 1e-05, 1e-06]
max_iterations
¶The maximum number of Expectation Maximisation iterations to run (even if convergence has not been reached)
+Default value: 25
Examples: [20, 150]
unique_id_column_name
¶Splink requires that the input dataset has a column that uniquely identifies each record. unique_id_column_name
is the name of the column in the input dataset representing this unique id
For linking tasks, ids must be unique within each dataset being linked, and do not need to be globally unique across input datasets
+Default value: unique_id
Examples: ['unique_id', 'id', 'pk']
source_dataset_column_name
¶The name of the column in the input dataset representing the source dataset
+Where we are linking datasets, we can't guarantee that the unique id column is globally unique across datasets, so we combine it with a source_dataset column. Usually, this is created by Splink for the user
+Default value: source_dataset
Examples: ['source_dataset', 'dataset_name']
retain_matching_columns
¶If set to true, each column used by the comparisons
SQL expressions will be retained in output datasets
This is helpful so that the user can inspect matches, but once the comparison vector (gamma) columns are computed, this information is not actually needed by the algorithm. The algorithm will run faster and use less resources if this is set to false.
+Default value: True
Examples: [False, True]
retain_intermediate_calculation_columns
¶Retain intermediate calculation columns, such as the Bayes factors associated with each column in comparisons
The algorithm will run faster and use less resources if this is set to false.
+Default value: False
Examples: [False, True]
A list specifying how records should be compared for probabilistic matching. Each element is a dictionary
+comparisons
The name used to refer to this comparison in the output dataset. By default, Splink will set this to the name(s) of any input columns used in the comparison. This key is most useful to give a clearer description to comparisons that use multiple input columns. e.g. a location column that uses postcode and town may be named location
+For a comparison column that uses a single input column, e.g. first_name, this will be set to first_name. For comparison columns that use multiple columns, if left blank, this will be set to the concatenation of columns used.
+Examples: ['first_name', 'surname']
An optional label to describe this comparison, to be used in charting outputs.
+Examples: ['First name exact match', 'Surname with middle levenshtein level']
Comparison levels specify how input values should be compared. Each level corresponds to an assessment of similarity, such as exact match, Jaro-Winkler match, one side of the match being null, etc
+Each comparison level represents a branch of a SQL case expression. They are specified in order of evaluation, each with a sql_condition
that represents the branch of a case expression
Example: +
[{
+ "sql_condition": "first_name_l IS NULL OR first_name_r IS NULL",
+ "label_for_charts": "null",
+ "null_level": True
+},
+{
+ "sql_condition": "first_name_l = first_name_r",
+ "label_for_charts": "exact_match",
+ "tf_adjustment_column": "first_name"
+},
+{
+ "sql_condition": "ELSE",
+ "label_for_charts": "else"
+}]
+
comparison_levels
sql_condition
¶A branch of a SQL case expression without WHEN and THEN e.g. jaro_winkler_sim(surname_l, surname_r) > 0.88
Examples: ['forename_l = forename_r', 'jaro_winkler_sim(surname_l, surname_r) > 0.88']
A label for this comparison level, which will appear on charts as a reminder of what the level represents
+Examples: ['exact', 'postcode exact']
the u probability for this comparison level - i.e. the proportion of records that match this level amongst truly non-matching records
+Examples: [0.9]
the m probability for this comparison level - i.e. the proportion of records that match this level amongst truly matching records
+Examples: [0.1]
+If true, m and u values will not be estimated and instead the match weight will be zero for this column. See the treatment of nulls on page 356 of https://imai.fas.harvard.edu/research/files/linkage.pdf, which notes: 'Under this MAR assumption, we can simply ignore missing data.'
+Default value: False
Make term frequency adjustments for this comparison level using this input column
+Default value: None
Examples: ['first_name', 'postcode']
Make term frequency adjustments using this weight. A weight of 1.0 is a full adjustment. A weight of 0.0 is no adjustment. A weight of 0.5 is a half adjustment
+Default value: 1.0
Examples: [1.0, 0.5]
Where the term frequency adjustment implies a u value below this value, use this minimum value instead
+This prevents excessive weight being assigned to very unusual terms, such as a collision on a typo
+Default value: 0.0
Examples: [0.001, 1e-09]
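+Taken together, the term frequency keys above can be combined within a single comparison level, for example:
+```py
+{
+    "sql_condition": "first_name_l = first_name_r",
+    "label_for_charts": "Exact match",
+    "tf_adjustment_column": "first_name",
+    "tf_adjustment_weight": 0.5,
+    "tf_minimum_u_value": 0.001,
+}
+```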
blocking_rules_to_generate_predictions
¶A list of one or more blocking rules to apply. A Cartesian join is applied if blocking_rules_to_generate_predictions
is empty or not supplied.
Each rule is a SQL expression representing the blocking rule, which will be used to create a join. The left table is aliased with l
and the right table is aliased with r
. For example, if you want to block on a first_name
column, the blocking rule would be
l.first_name = r.first_name
.
To block on first name and the first letter of surname, it would be
+l.first_name = r.first_name and substr(l.surname,1,1) = substr(r.surname,1,1)
.
Note that Splink deduplicates the comparisons generated by the blocking rules.
+If empty or not supplied, all comparisons between the input dataset(s) will be generated and blocking will not be used. For large input datasets, this will generally be computationally intractable because it will generate comparisons equal to the number of rows squared.
+Default value: []
Examples: [['l.first_name = r.first_name AND l.surname = r.surname', 'l.dob = r.dob']]
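+Equivalently, blocking rules like these can be written with the block_on helper used elsewhere in these docs:
+```py
+from splink import block_on
+
+blocking_rules_to_generate_predictions = [
+    block_on("first_name", "surname"),
+    block_on("dob"),
+]
+```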
additional_columns_to_retain
¶A list of columns not being used in the probabilistic matching comparisons that you want to include in your results.
+By default, Splink drops columns which are not used by any comparisons. This gives you the option to retain columns which are not used by the model. A common example is if the user has labelled data (training data) and wishes to retain the labels in the outputs
+Default value: []
Examples: [['cluster', 'col_2'], ['other_information']]
bayes_factor_column_prefix
¶The prefix to use for the columns that will be created to store the Bayes factors
+Default value: bf_
Examples: ['bf_', '__bf__']
term_frequency_adjustment_column_prefix
¶The prefix to use for the columns that will be created to store the term frequency adjustments
+Default value: tf_
Examples: ['tf_', '__tf__']
comparison_vector_value_column_prefix
¶The prefix to use for the columns that will be created to store the comparison vector values
+Default value: gamma_
Examples: ['gamma_', '__gamma__']
sql_dialect
¶The SQL dialect in which sql_conditions
are written. Must be a valid SQLGlot dialect
Default value: None
Examples: ['spark', 'duckdb', 'presto', 'sqlite']
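+To illustrate how these keys fit together, here is a sketch of a settings dictionary using only the keys documented above (the column names are hypothetical, and the comparisons list is elided):
+```py
+settings = {
+    "link_type": "dedupe_only",
+    "probability_two_random_records_match": 0.0001,
+    "unique_id_column_name": "unique_id",
+    "blocking_rules_to_generate_predictions": [
+        "l.first_name = r.first_name AND l.surname = r.surname",
+        "l.dob = r.dob",
+    ],
+    "comparisons": [...],  # list of comparison dicts as described above
+    "retain_matching_columns": True,
+    "additional_columns_to_retain": ["cluster"],
+    "em_convergence": 0.0001,
+    "max_iterations": 25,
+}
+```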
SplinkDataFrame
¶
+ Bases: ABC
Abstraction over dataframe to handle basic operations like retrieving data and
+retrieving column names, which need different implementations depending on whether
+it's a spark dataframe, sqlite table etc.
+Uses methods like as_pandas_dataframe()
and as_record_dict()
to retrieve data
as_pandas_dataframe(limit=None)
+
+¶Return the dataframe as a pandas dataframe.
+This can be computationally expensive if the dataframe is large.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
limit |
+
+ int
+ |
+
+
+
+ If provided, return this number of rows (equivalent +to a limit statement in SQL). Defaults to None, meaning return all rows + |
+
+ None
+ |
+
Examples:
+df_predict = linker.inference.predict()
+df_ten_edges = df_predict.as_pandas_dataframe(10)
+
as_record_dict(limit=None)
+
+¶Return the dataframe as a list of record dictionaries.
+This can be computationally expensive if the dataframe is large.
+ + +Examples:
+df_predict = linker.inference.predict()
+ten_edges = df_predict.as_record_dict(10)
+
Returns:
+Name | Type | +Description | +
---|---|---|
list |
+ list[dict[str, Any]]
+ |
+
+
+
+ a list of records, each of which is a dictionary + |
+
drop_table_from_database_and_remove_from_cache(force_non_splink_table=False)
+
+¶Drops the table from the underlying database, and removes it +from the (linker) cache.
+By default this will fail if the table is not one created by Splink, +but this check can be overridden.
+ + +Examples:
+df_predict = linker.inference.predict()
+df_predict.drop_table_from_database_and_remove_from_cache()
+# predictions table no longer in the database / cache
+
to_csv(filepath, overwrite=False)
+
+¶Save the dataframe in csv format.
+ + +Examples:
+df_predict = linker.inference.predict()
+df_predict.to_csv("model_predictions.csv", overwrite=True)
+
to_parquet(filepath, overwrite=False)
+
+¶Save the dataframe in parquet format.
+ + +Examples:
+df_predict = linker.inference.predict()
+df_predict.to_parquet("model_predictions.parquet", overwrite=True)
+
Register Splink tables against your database backend and manage the Splink cache.
+Accessed via linker.table_management
.
compute_tf_table(column_name)
+
+¶Compute a term frequency table for a given column and persist to the database
+This method is useful if you want to pre-compute term frequency tables e.g. +so that real time linkage executes faster, or so that you can estimate +various models without having to recompute term frequency tables each time
+Examples:
+Real time linkage
+```py
+linker = Linker(df, settings="saved_settings.json", db_api=db_api)
+linker.table_management.compute_tf_table("surname")
+linker.inference.compare_two_records(record_left, record_right)
+```
+Pre-computed term frequency tables
+```py
+linker = Linker(df, settings="saved_settings.json", db_api=db_api)
+df_first_name_tf = linker.table_management.compute_tf_table("first_name")
+df_first_name_tf.write.parquet("folder/first_name_tf")
+
+# On subsequent data linking job, read this table rather than recompute
+df_first_name_tf = pd.read_parquet("folder/first_name_tf")
+df_first_name_tf.createOrReplaceTempView("__splink__df_tf_first_name")
+```
+
+
+
+Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
column_name |
+
+ str
+ |
+
+
+
+ The column name in the input table + |
+ + required + | +
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ The resultant table as a splink data frame + |
+
invalidate_cache()
+
+¶Invalidate the Splink cache. Any previously-computed tables +will be recomputed. +This is useful, for example, if the input data tables have changed.
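+For example:
+```py
+# After replacing or mutating the underlying input tables:
+linker.table_management.invalidate_cache()
+```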
+ +register_table_input_nodes_concat_with_tf(input_data, overwrite=False)
+
+¶Register a pre-computed version of the input_nodes_concat_with_tf table that +you want to re-use e.g. that you created in a previous run.
+This method allows you to register this table in the Splink cache so it will be +used rather than Splink computing this table anew.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
input_data |
+
+ AcceptableInputTableType
+ |
+
+
+
+ The data you wish to register. This +can be either a dictionary, pandas dataframe, pyarrow table or a spark +dataframe. + |
+ + required + | +
overwrite |
+
+ bool
+ |
+
+
+
+ Overwrite the table in the underlying database if it +exists. + |
+
+ False
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ An abstraction representing the table created by the sql +pipeline + |
+
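+A sketch of re-using a table saved from a previous run (the file path is hypothetical):
+```py
+nodes_with_tf = pd.read_parquet("out/nodes_with_tf.parquet")
+linker.table_management.register_table_input_nodes_concat_with_tf(
+    nodes_with_tf, overwrite=True
+)
+```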
register_table_predict(input_data, overwrite=False)
+
+¶Register a pre-computed version of the prediction table for use in Splink.
+This method allows you to register a pre-computed prediction table in the Splink +cache so it will be used rather than Splink computing the table anew.
+ + +Examples:
+predict_df = pd.read_parquet("path/to/predict_df.parquet")
+predict_as_splinkdataframe = linker.table_management.register_table_predict(predict_df)
+clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
+ predict_as_splinkdataframe, threshold_match_probability=0.75
+)
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
input_data |
+
+ AcceptableInputTableType
+ |
+
+
+
+ The data you wish to register. This +can be either a dictionary, pandas dataframe, pyarrow table, or a spark +dataframe. + |
+ + required + | +
overwrite |
+
+ bool
+ |
+
+
+
+ Overwrite the table in the underlying database +if it exists. Defaults to False. + |
+
+ False
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame | + SplinkDataFrame + |
+
+
+ An abstraction representing the table created by the SQL +pipeline. + |
+
register_term_frequency_lookup(input_data, col_name, overwrite=False)
+
+¶Register a pre-computed term frequency lookup table for a given column.
+This method allows you to register a term frequency table in the Splink +cache for a specific column. This table will then be used during linkage +rather than computing the term frequency table anew from your input data.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
input_data |
+
+ AcceptableInputTableType
+ |
+
+
+
+ The data representing the term +frequency table. This can be either a dictionary, pandas dataframe, +pyarrow table, or a spark dataframe. + |
+ + required + | +
col_name |
+
+ str
+ |
+
+
+
+ The name of the column for which the term frequency +lookup table is being registered. + |
+ + required + | +
overwrite |
+
+ bool
+ |
+
+
+
+ Overwrite the table in the underlying +database if it exists. Defaults to False. + |
+
+ False
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame | + SplinkDataFrame + |
+
+
+
+ An abstraction representing the registered term frequency table. + |
+
Examples:
+tf_table = [
+ {"first_name": "theodore", "tf_first_name": 0.012},
+ {"first_name": "alfie", "tf_first_name": 0.013},
+]
+tf_df = pd.DataFrame(tf_table)
+linker.table_management.register_term_frequency_lookup(tf_df,
+ "first_name")
+
register_table(input_table, table_name, overwrite=False)
+
+¶Register a table to your backend database, to be used in one of the +splink methods, or simply to allow querying.
+Tables can be of type: dictionary, record level dictionary, +pandas dataframe, pyarrow table and in the spark case, a spark df.
+ + +Examples:
+test_dict = {"a": [666,777,888],"b": [4,5,6]}
+linker.table_management.register_table(test_dict, "test_dict")
+linker.misc.query_sql("select * from test_dict")
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
input_table |
+
+ AcceptableInputTableType
+ |
+
+
+
+ The data you wish to register. This can be either a dictionary, +pandas dataframe, pyarrow table or a spark dataframe. + |
+ + required + | +
table_name |
+
+ str
+ |
+
+
+
+ The name you wish to assign to the table. + |
+ + required + | +
overwrite |
+
+ bool
+ |
+
+
+
+ Overwrite the table in the underlying database if it +exists + |
+
+ False
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
SplinkDataFrame |
+ SplinkDataFrame
+ |
+
+
+
+ An abstraction representing the table created by the sql +pipeline + |
+
Estimate the parameters of the linkage model, accessed via
+linker.training
.
estimate_probability_two_random_records_match(deterministic_matching_rules, recall, max_rows_limit=int(1000000000.0))
+
+¶Estimate the model parameter probability_two_random_records_match
using
+a direct estimation approach.
This method counts the number of matches found using deterministic rules and +divides by the total number of possible record comparisons. The recall of the +deterministic rules is used to adjust this proportion up to reflect missed +matches, providing an estimate of the probability that two random records from +the input data are a match.
+Note that if more than one deterministic rule is provided, any duplicate +pairs are automatically removed, so you do not need to worry about double +counting.
+See here +for discussion of methodology.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
deterministic_matching_rules |
+
+ list
+ |
+
+
+
+ A list of deterministic matching +rules designed to admit very few (preferably no) false positives. + |
+ + required + | +
recall |
+
+ float
+ |
+
+
+
+ An estimate of the recall the deterministic matching +rules will achieve, i.e., the proportion of all true matches these +rules will recover. + |
+ + required + | +
max_rows_limit |
+
+ int
+ |
+
+
+
+ Maximum number of rows to consider during estimation. +Defaults to 1e9. + |
+
+ int(1000000000.0)
+ |
+
Examples:
+deterministic_rules = [
+ block_on("forename", "dob"),
+ "l.forename = r.forename and levenshtein(r.surname, l.surname) <= 2",
+ block_on("email")
+]
+linker.training.estimate_probability_two_random_records_match(
+ deterministic_rules, recall=0.8
+)
+
estimate_u_using_random_sampling(max_pairs=1000000.0, seed=None)
+
+¶Estimate the u parameters of the linkage model using random sampling.
+The u parameters estimate the proportion of record comparisons that fall +into each comparison level amongst truly non-matching records.
+This procedure takes a sample of the data and generates the cartesian +product of pairwise record comparisons amongst the sampled records. +The validity of the u values rests on the assumption that the resultant +pairwise comparisons are non-matches (or at least, they are very unlikely to be +matches). For large datasets, this is typically true.
+The results of estimate_u_using_random_sampling, and therefore an entire splink +model, can be made reproducible by setting the seed parameter. Setting the seed +will have performance implications as additional processing is required.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
max_pairs |
+
+ int
+ |
+
+
+
+ The maximum number of pairwise record comparisons to +sample. Larger will give more accurate estimates but lead to longer +runtimes. In our experience at least 1e9 (one billion) gives best +results but can take a long time to compute. 1e7 (ten million) +is often adequate whilst testing different model specifications, before +the final model is estimated. + |
+
+ 1000000.0
+ |
+
seed |
+
+ int
+ |
+
+
+
+ Seed for random sampling. Assign to get reproducible u +probabilities. Note, seed for random sampling is only supported for +DuckDB and Spark, for Athena and SQLite set to None. + |
+
+ None
+ |
+
Examples:
+linker.training.estimate_u_using_random_sampling(max_pairs=1e8)
+
Returns:
+Name | Type | +Description | +
---|---|---|
Nothing |
+ None
+ |
+
+
+
+ Updates the estimated u parameters within the linker object and +returns nothing. + |
+
estimate_parameters_using_expectation_maximisation(blocking_rule, estimate_without_term_frequencies=False, fix_probability_two_random_records_match=False, fix_m_probabilities=False, fix_u_probabilities=True, populate_probability_two_random_records_match_from_trained_values=False)
+
+¶Estimate the parameters of the linkage model using expectation maximisation.
+By default, the m probabilities are estimated, but not the u probabilities,
+because good estimates for the u probabilities can be obtained from
+linker.training.estimate_u_using_random_sampling()
. You can change this by
+setting fix_u_probabilities
to False.
The blocking rule provided is used to generate pairwise record comparisons. +Usually, this should be a blocking rule that results in a dataframe where +matches are between about 1% and 99% of the blocked comparisons.
+By default, m parameters are estimated for all comparisons except those which +are included in the blocking rule.
+For example, if the blocking rule is block_on("first_name")
, then
+parameter estimates will be made for all comparison except those which use
+first_name
in their sql_condition
By default, the probability two random records match is allowed to vary +during EM estimation, but is not saved back to the model. See +this PR for +the rationale.
+ + +Examples:
+Default behaviour +
br_training = block_on("first_name", "dob")
+linker.training.estimate_parameters_using_expectation_maximisation(
+ br_training
+)
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
blocking_rule |
+
+ BlockingRuleCreator | str
+ |
+
+
+
+ The blocking rule used to +generate pairwise record comparisons. + |
+ + required + | +
estimate_without_term_frequencies |
+
+ bool
+ |
+
+
+
+ If True, the iterations +of the EM algorithm ignore any term frequency adjustments and only +depend on the comparison vectors. This allows the EM algorithm to run +much faster, but the estimation of the parameters will change slightly. + |
+
+ False
+ |
+
fix_probability_two_random_records_match |
+
+ bool
+ |
+
+
+
+ If True, do not +update the probability two random records match after each iteration. +Defaults to False. + |
+
+ False
+ |
+
fix_m_probabilities |
+
+ bool
+ |
+
+
+
+ If True, do not update the m +probabilities after each iteration. Defaults to False. + |
+
+ False
+ |
+
fix_u_probabilities |
+
+ bool
+ |
+
+
+
+ If True, do not update the u +probabilities after each iteration. Defaults to True. + |
+
+ True
+ |
+
populate_probability_two_random_records_match_from_trained_values |
+
+ (bool, optional)
+ |
+
+
+
+ The full name of this parameter is +populate_probability_two_random_records_match_from_trained_values. If +True, derive this parameter from the blocked value. Defaults to False. + |
+ False + |
Examples:
+blocking_rule = block_on("first_name", "surname")
+linker.training.estimate_parameters_using_expectation_maximisation(
+ blocking_rule
+)
+
Returns:
+Name | Type | +Description | +
---|---|---|
EMTrainingSession |
+ EMTrainingSession
+ |
+
+
+
+ An object containing information about the training +session such as how parameters changed during the iteration history + |
+
estimate_m_from_pairwise_labels(labels_splinkdataframe_or_table_name)
+
+¶Estimate the m probabilities of the linkage model from a dataframe of +pairwise labels.
+The table of labels should be in the following format, and should +be registered with your database:
+source_dataset_l | +unique_id_l | +source_dataset_r | +unique_id_r | +
---|---|---|---|
df_1 | +1 | +df_2 | +2 | +
df_1 | +1 | +df_2 | +3 | +
Note that source_dataset
and unique_id
should correspond to the
+values specified in the settings dict, and the input_table_aliases
+passed to the linker
object. Note that at the moment, this method does
+not respect values in a clerical_match_score
column. If provided, these
+are ignored and it is assumed that every row in the table of labels is a score
+of 1, i.e. a perfect match.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
labels_splinkdataframe_or_table_name |
+
+ str
+ |
+
+
+
+ Name of table containing labels +in the database or SplinkDataframe + |
+ + required + | +
Examples:
+pairwise_labels = pd.read_csv("./data/pairwise_labels_to_estimate_m.csv")
+
+linker.table_management.register_table(
+ pairwise_labels, "labels", overwrite=True
+)
+
+linker.training.estimate_m_from_pairwise_labels("labels")
+
estimate_m_from_label_column(label_colname)
+
+¶Estimate the m parameters of the linkage model from a label (ground truth) +column in the input dataframe(s).
+The m parameters represent the proportion of record comparisons that fall +into each comparison level amongst truly matching records.
+The ground truth column is used to generate pairwise record comparisons +which are then assumed to be matches.
+For example, if the entity being matched is persons, and your input dataset(s) +contain social security number, this could be used to estimate the m values +for the model.
+Note that this column does not need to be fully populated. A common case is +where a unique identifier such as social security number is only partially +populated.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
label_colname |
+
+ str
+ |
+
+
+
+ The name of the column containing the ground truth +label in the input data. + |
+ + required + | +
Examples:
+linker.training.estimate_m_from_label_column("social_security_number")
+
Returns:
+Name | Type | +Description | +
---|---|---|
Nothing |
+ None
+ |
+
+
+
+ Updates the estimated m parameters within the linker object. + |
+
Visualisations to help you understand and diagnose your linkage model.
+Accessed via linker.visualisations
.
Most of the visualisations return an altair.Chart +object, meaning it can be saved and manipulated using Altair.
+For example:
+altair_chart = linker.visualisations.match_weights_chart()
+
+# Save to various formats
+altair_chart.save("mychart.png")
+altair_chart.save("mychart.html")
+altair_chart.save("mychart.svg")
+altair_chart.save("mychart.json")
+
+# Get chart spec as dict
+altair_chart.to_dict()
+
To save the chart as a self-contained html file with all scripts +inlined so it can be viewed offline:
+from splink.internals.charts import save_offline_chart
+c = linker.visualisations.match_weights_chart()
+save_offline_chart(c.to_dict(), "test_chart.html")
+
View resultant html file in Jupyter (or just load it in your browser)
+from IPython.display import IFrame
+IFrame(src="./test_chart.html", width=1000, height=500)
+
match_weights_chart(as_dict=False)
+
+¶Display a chart of the (partial) match weights of the linkage model
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
as_dict |
+
+ bool
+ |
+
+
+
+ If True, return the chart as a dictionary. + |
+
+ False
+ |
+
Examples:
+altair_chart = linker.visualisations.match_weights_chart()
+altair_chart.save("mychart.png")
+
m_u_parameters_chart(as_dict=False)
+
+¶Display a chart of the m and u parameters of the linkage model
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
as_dict |
+
+ bool
+ |
+
+
+
+ If True, return the chart as a dictionary. + |
+
+ False
+ |
+
Examples:
+altair_chart = linker.visualisations.m_u_parameters_chart()
+altair_chart.save("mychart.png")
+
Returns:
+Name | Type | +Description | +
---|---|---|
altair_chart |
+ ChartReturnType
+ |
+
+
+
+ An altair chart + |
+
match_weights_histogram(df_predict, target_bins=30, width=600, height=250, as_dict=False)
+
+¶Generate a histogram that shows the distribution of match weights in
+df_predict
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df_predict |
+
+ SplinkDataFrame
+ |
+
+
+
+ Output of linker.inference.predict() + |
+ + required + | +
target_bins |
+
+ int
+ |
+
+
+
+ Target number of bins in histogram. Defaults to +30. + |
+
+ 30
+ |
+
width |
+
+ int
+ |
+
+
+
+ Width of output. Defaults to 600. + |
+
+ 600
+ |
+
height |
+
+ int
+ |
+
+
+
+ Height of output chart. Defaults to 250. + |
+
+ 250
+ |
+
as_dict |
+
+ bool
+ |
+
+
+
+ If True, return the chart as a dictionary. + |
+
+ False
+ |
+
Examples:
+df_predict = linker.inference.predict(threshold_match_weight=-2)
+linker.visualisations.match_weights_histogram(df_predict)
+
parameter_estimate_comparisons_chart(include_m=True, include_u=False, as_dict=False)
+
+¶Show a chart that shows how parameter estimates have differed across +the different estimation methods you have used.
+For example, if you have run two EM estimation sessions, blocking on +different variables, and both result in parameter estimates for +first_name, this chart will enable easy comparison of the different +estimates
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
include_m |
+
+ bool
+ |
+
+
+
+ Show different estimates of m values. Defaults +to True. + |
+
+ True
+ |
+
include_u |
+
+ bool
+ |
+
+
+
+ Show different estimates of u values. Defaults +to False. + |
+
+ False
+ |
+
as_dict |
+
+ bool
+ |
+
+
+
+ If True, return the chart as a dictionary. + |
+
+ False
+ |
+
Examples:
+linker.training.estimate_parameters_using_expectation_maximisation(
+ blocking_rule=block_on("first_name"),
+)
+
+linker.training.estimate_parameters_using_expectation_maximisation(
+ blocking_rule=block_on("surname"),
+)
+
+linker.visualisations.parameter_estimate_comparisons_chart()
+
Returns:
+Name | Type | +Description | +
---|---|---|
altair_chart |
+ ChartReturnType
+ |
+
+
+
+ An Altair chart + |
+
tf_adjustment_chart(output_column_name, n_most_freq=10, n_least_freq=10, vals_to_include=None, as_dict=False)
+
+¶Display a chart showing the impact of term frequency adjustments on a +specific comparison level. +Each value is plotted together with the match weight implied by its term frequency.
+ + +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
output_column_name |
+
+ str
+ |
+
+
+
+ Name of an output column for which term frequency + adjustment has been applied. + |
+ + required + | +
n_most_freq |
+
+ int
+ |
+
+
+
+ Number of most frequent values to show. If this
+ or n_least_freq is set to None, all values will be shown. Defaults to 10. |
+
+ 10
+ |
+
n_least_freq |
+
+ int
+ |
+
+
+
+ Number of least frequent values to show. If
+this or n_most_freq is set to None, all values will be shown. Defaults to 10. |
+
+ 10
+ |
+
vals_to_include |
+
+ list
+ |
+
+
+
+ Specific values for which to show term +frequency adjustments. +Defaults to None. + |
+
+ None
+ |
+
as_dict |
+
+ bool
+ |
+
+
+
+ If True, return the chart as a dictionary. + |
+
+ False
+ |
+
Examples:
+linker.visualisations.tf_adjustment_chart("first_name")
+
Returns:
+Name | Type | +Description | +
---|---|---|
altair_chart |
+ ChartReturnType
+ |
+
+
+
+ An Altair chart + |
+
waterfall_chart(records, filter_nulls=True, remove_sensitive_data=False, as_dict=False)
+
+¶Visualise how the final match weight is computed for the provided pairwise +record comparisons.
+Records must be provided as a list of dictionaries. This would usually be
+obtained from df.as_record_dict(limit=n)
where df
is a SplinkDataFrame.
Examples:
+df = linker.inference.predict(threshold_match_weight=2)
+records = df.as_record_dict(limit=10)
+linker.visualisations.waterfall_chart(records)
+
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
records |
+
+ List[dict]
+ |
+
+
+
+ Usually obtained from df.as_record_dict(limit=n). + |
+ + required + | +
filter_nulls |
+
+ bool
+ |
+
+
+
+ Whether the visualisation shows null +comparisons, which have no effect on final match weight. Defaults to +True. + |
+
+ True
+ |
+
remove_sensitive_data |
+
+ bool
+ |
+
+
+
+ When True, The waterfall chart will +contain match weights only, and all of the (potentially sensitive) data +from the input tables will be removed prior to the chart being created. + |
+
+ False
+ |
+
as_dict |
+
+ bool
+ |
+
+
+
+ If True, return the chart as a dictionary. + |
+
+ False
+ |
+
Returns:
+Name | Type | +Description | +
---|---|---|
altair_chart |
+ ChartReturnType
+ |
+
+
+
+ An Altair chart + |
+
comparison_viewer_dashboard(df_predict, out_path, overwrite=False, num_example_rows=2, return_html_as_string=False)
+
+¶Generate an interactive html visualization of the linker's predictions and
+save to out_path
. For more information see
+this video
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df_predict |
+
+ SplinkDataFrame
+ |
+
+
+
+ The outputs of linker.inference.predict(). + |
+ + required + | +
out_path |
+
+ str
+ |
+
+
+
+ The path (including filename) to save the html file to. + |
+ + required + | +
overwrite |
+
+ bool
+ |
+
+
+
+ Overwrite the html file if it already exists? +Defaults to False. + |
+
+ False
+ |
+
num_example_rows |
+
+ int
+ |
+
+
+
+ Number of example rows per comparison +vector. Defaults to 2. + |
+
+ 2
+ |
+
return_html_as_string |
+
+ bool
+ |
+
+
+
+ If True, return the html as a string + |
+
+ False
+ |
+
Examples:
+df_predictions = linker.inference.predict()
+linker.visualisations.comparison_viewer_dashboard(
+ df_predictions, "scv.html", True, 2
+)
+
Optionally, in Jupyter, you can display the results inline +Otherwise you can just load the html file in your browser
+from IPython.display import IFrame
+IFrame(src="./scv.html", width="100%", height=1200)
+
cluster_studio_dashboard(df_predict, df_clustered, out_path, sampling_method='random', sample_size=10, cluster_ids=None, cluster_names=None, overwrite=False, return_html_as_string=False, _df_cluster_metrics=None)
+
+¶Generate an interactive html visualization of the predicted cluster and
+save to out_path
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df_predict |
+
+ SplinkDataFrame
+ |
+
+
+
+ The outputs of linker.inference.predict(). + |
+ + required + | +
df_clustered |
+
+ SplinkDataFrame
+ |
+
+
+
+ The outputs of linker.clustering.cluster_pairwise_predictions_at_threshold(). + |
+ + required + | +
out_path |
+
+ str
+ |
+
+
+
+ The path (including filename) to save the html file to. + |
+ + required + | +
sampling_method |
+
+ str
+ |
+
+
+
+ The sampling method used to choose which clusters to display. Defaults to 'random'.
|
+
+ 'random'
+ |
+
sample_size |
+
+ int
+ |
+
+
+
+ Number of clusters to show in the dashboard. +Defaults to 10. + |
+
+ 10
+ |
+
cluster_ids |
+
+ list
+ |
+
+
+
+ The IDs of the clusters that will be displayed in the
+dashboard. If provided, the sampling_method and sample_size arguments are ignored. Defaults to None. |
+
+ None
+ |
+
overwrite |
+
+ bool
+ |
+
+
+
+ Overwrite the html file if it already exists? +Defaults to False. + |
+
+ False
+ |
+
cluster_names |
+
+ list
+ |
+
+
+
+ If provided, the dashboard will display
+these names in the selection box. Only works in conjunction with cluster_ids.
+ |
+
+ None
+ |
+
return_html_as_string |
+
+ bool
+ |
+
+
+
+ If True, return the html as a string + |
+
+ False
+ |
+
Examples:
+df_p = linker.inference.predict()
+df_c = linker.clustering.cluster_pairwise_predictions_at_threshold(
+ df_p, 0.5
+)
+
+linker.visualisations.cluster_studio_dashboard(
+ df_p, df_c, "cluster_studio.html", cluster_ids=[0, 4, 7]
+)
+
Optionally, in Jupyter, you can display the results inline +Otherwise you can just load the html file in your browser
+from IPython.display import IFrame
+IFrame(src="./cluster_studio.html", width="100%", height=1200)
+
{"use strict";/*!
+ * escape-html
+ * Copyright(c) 2012-2013 TJ Holowaychuk
+ * Copyright(c) 2015 Andreas Lubbe
+ * Copyright(c) 2015 Tiancheng "Timothy" Gu
+ * MIT Licensed
+ */var Wa=/["'&<>]/;Vn.exports=Ua;function Ua(e){var t=""+e,r=Wa.exec(t);if(!r)return t;var o,n="",i=0,s=0;for(i=r.index;i