Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions kardia-app/modules/base/dups/cluster_params.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
$Version=2$
duplicate_checking_globals "system/query"
{
// Computes parameters for clustering and searching, based on the provided
// size of the data.
//
// Parameter:
//   num_data - integer size of the data (declared with min = 1).
//
// Result: a single row with two attributes:
//   algorithm - 'kmeans' when num_data is greater than 50000, otherwise
//               'none'.
//   k         - power(log(num_data, 36), 3.2) - 8, converted to an integer
//               and passed through constrain() with 2 and a NULL integer.
//               Presumably this means a lower bound of 2 and no upper
//               bound -- TODO confirm against the constrain() reference.
//
// NOTE(review): 'kmeans' and 'none' are also the names of the
// "cluster/cluster" objects declared in dups.cluster; presumably this
// result is what gets passed as that file's algorithm parameter -- verify
// against the caller.
//
// NOTE(review): neither expression reads a column from the FROM source.
// Together with LIMIT 1 it appears to serve only to make the SELECT return
// exactly one row -- confirm.

num_data "query/parameter" { type = integer; style = strnull; min = 1; }

sql = "
SELECT
-- Compute which clustering algorithm to use (no clustering is better for small amounts of data).
algorithm = condition(:parameters:num_data > 50000, 'kmeans', 'none'),

-- Compute the k value for clustering.
k = constrain(convert(integer, power(log(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL))
FROM
/apps/kardia/data/Kardia_DB/_a_alphabet/rows
LIMIT 1
;
";
}
68 changes: 68 additions & 0 deletions kardia-app/modules/base/dups/dups.cluster
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
$Version=2$
cluster_dups "system/cluster"
{
// Cluster definition used for duplicate detection. It declares one data
// source, two alternative "cluster/cluster" objects (kmeans and none), and
// four "cluster/search" objects that differ only in similarity measure and
// threshold.

// Declare parameters.
//   algorithm - name of the cluster object each search reads from; the
//               clusters declared below are named kmeans and none.
//   k         - number of clusters, used only by the kmeans cluster.
//   field     - base name of the query file under dups/get/ to read data
//               from (the source path is built as get/<field>.qy).
//   data      - used as data_attr; presumably the name of the attribute in
//               that query's result which holds the text to compare --
//               TODO confirm.
algorithm "cluster/parameter" { type = string; style = notnull; }
k "cluster/parameter" { type = integer; style = notnull; }
field "cluster/parameter" { type = string; style = notnull; }
data "cluster/parameter" { type = string; style = notnull; }

// Declare data source.
// Every query file under dups/get/ returns an attribute named key, which
// is used as key_attr here.
source = runserver('/apps/kardia/modules/base/dups/get/' + :parameters:field + '.qy');
key_attr = "key";
data_attr = runserver(:parameters:data);

// A cluster for searching with clustering.
// The number of clusters comes from the k parameter; the remaining
// settings are fixed.
kmeans "cluster/cluster"
{
algorithm = "k-means";
similarity_measure = "cosine";
num_clusters = runserver(:parameters:k);
min_improvement = 0.0001;
max_iterations = 32;
}

// A "cluster" for searching without clustering.
none "cluster/cluster"
{
algorithm = "none";
}

// Default duplicate search, used for names, emails, and addresses.
// Uses cosine similarity with a threshold of 0.7.
dups "cluster/search"
{
source = runserver(:parameters:algorithm);
similarity_measure = "cosine";
threshold = 0.7;
}

// Double Metaphone search.
// Double Metaphone is prone to false positives, so it uses a higher
// threshold to make them slightly less bad.
// This search uses the levenshtein measure (like phone_dups) rather than
// cosine, with a threshold of 0.8.
meta_dups "cluster/search"
{
source = runserver(:parameters:algorithm);
similarity_measure = "levenshtein";
threshold = 0.8;
}

// Phone search.
// Searching for duplicate phone numbers uses edit distance similarity
// (the levenshtein measure) instead of cosine to give more accurate
// results.
phone_dups "cluster/search"
{
source = runserver(:parameters:algorithm);
similarity_measure = "levenshtein";
threshold = 0.7;
}

// Concat search.
// The concatenation strategy is faster than the aggregation strategy, so
// we can use a slightly lower threshold to detect just a few more dups.
// Uses cosine similarity with a threshold of 0.65.
concat_dups "cluster/search"
{
source = runserver(:parameters:algorithm);
similarity_measure = "cosine";
threshold = 0.65;
}
}
35 changes: 35 additions & 0 deletions kardia-app/modules/base/dups/get/addresses.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
$Version=2$
get_addresses "system/query"
{
// Returns one row per usable p_location record, with two attributes:
//   key     - the record's p_partner_key.
//   address - the address fields listed below, concatenated in that order
//             with no separator; null fields contribute an empty string.
//
// To get the address, we need to concatenate several fields found in the
// p_location table:
// - p_in_care_of: For sending mail to a recipient without an address who
//                 will have a different person/organization receive the
//                 mail for them. The address of that entity is provided.
// - p_address_1, p_address_2, & p_address_3: Up to 3 lines of an address.
// - p_city, p_state_province: The city and state (respectively).
// - p_country_code, p_postal_code: The country and postal code (respectively).
//
// If the p_address_1 field is null, empty, or only a single character long,
// the 'address' is ignored (the WHERE clause requires more than one
// character). This is very common because many systems in Centrallix assume
// that every record has an address, so every record has an associated
// address, even if it is almost completely blank.
sql = "
SELECT
key = :p_partner_key,
address = ''
+ isnull(:p_in_care_of, '')
+ isnull(:p_address_1, '')
+ isnull(:p_address_2, '')
+ isnull(:p_address_3, '')
+ isnull(:p_city, '')
+ isnull(:p_state_province, '')
+ isnull(:p_country_code, '')
+ isnull(:p_postal_code, '')
FROM
identity /apps/kardia/data/Kardia_DB/p_location/rows
WHERE
char_length(isnull(:p_address_1, '')) > 1
;
";
}
87 changes: 87 additions & 0 deletions kardia-app/modules/base/dups/get/concats.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
$Version=2$
get_concats "system/query"
{
// All data found in the files name_strs.qy, name_metas.qy, emails.qy,
// phones.qy, and addresses.qy is concatenated together here (although
// those files aren't read here for performance reasons), producing a
// single concatenated string with all the information for a person.
// This string is produced for every p_partner record in the database.
// Each output record holds at most one email, phone number, and address,
// so multiple records are produced for a given p_partner record if
// various combinations of this contact information are possible.
//
// The query runs as two statements:
//   1. An INSERT that fills the collection temp with one row per
//      combination, holding key, name_str, name_meta, email, phone, and
//      address. p_partner is joined to p_contact_info (once for emails,
//      once for phones) and to p_location using *= joins on p_partner_key.
//   2. A SELECT over temp that returns key and data, where data is
//      name_str, name_meta (three times), email, phone, and address
//      concatenated, with '`' after name_str, email, and phone, and '1'
//      after each copy of name_meta.
//
// NOTE(review): unlike name_metas.qy, the name_meta expression here does
// not require p_given_name and p_surname to be longer than one character,
// and no row is filtered out on that basis -- confirm this is intended.
//
// NOTE(review): the phone filter combines its two p_contact_type
// comparisons with '+'; presumably this acts as a logical OR (type P or
// type C, as in phones.qy) -- confirm.
//
// NOTE(review): the p_contact_type filters sit in the WHERE clause next to
// the *= joins; whether partners with no email or no phone still produce a
// row depends on how Centrallix evaluates such filters -- confirm.
//
// Note: Don't make the mistake of calling an attribute "name" or it might
//       accidentally become the canonical name of that object in the object
//       system, causing a ton of stuff to break in subtle and confusing ways.
sql = "
declare collection temp;

INSERT INTO
collection temp
SELECT
key = :p:p_partner_key,
name_str = ''
+ isnull(:p_given_name, '')
+ isnull(condition(
char_length(isnull(:p_given_name, '')) > 1
AND char_length(isnull(:p_surname, '')) > 1
AND :p_given_name != :p_preferred_name
AND :p_surname != :p_preferred_name,
:p_preferred_name,
''
), '')
+ isnull(:p_surname, '')
+ isnull(:p_org_name, ''),
name_meta = ''
+ isnull(metaphone(:p_given_name), '')
+ isnull(condition(
:p_given_name != :p_preferred_name
AND :p_surname != :p_preferred_name,
metaphone(:p_preferred_name),
''
), '')
+ isnull(metaphone(:p_surname), ''),
email = isnull(:e:p_contact_data, ''),
phone = ''
+ isnull(:ph:p_phone_country, '')
+ isnull(:ph:p_phone_area_city, '')
+ isnull(:ph:p_contact_data, ''),
address = ''
+ isnull(:l:p_in_care_of, '')
+ isnull(:l:p_address_1, '')
+ isnull(:l:p_address_2, '')
+ isnull(:l:p_address_3, '')
+ isnull(:l:p_city, '')
+ isnull(:l:p_state_province, '')
+ isnull(:l:p_country_code, '')
+ isnull(:l:p_postal_code, '')
FROM
identity /apps/kardia/data/Kardia_DB/p_partner/rows p,
/apps/kardia/data/Kardia_DB/p_contact_info/rows e,
/apps/kardia/data/Kardia_DB/p_contact_info/rows ph,
/apps/kardia/data/Kardia_DB/p_location/rows l
WHERE
:p:p_partner_key *= :e:p_partner_key
AND :p:p_partner_key *= :ph:p_partner_key
AND :p:p_partner_key *= :l:p_partner_key
AND :e:p_contact_type = 'E'
AND ((:ph:p_contact_type = 'P') + (:ph:p_contact_type = 'C'))
;

-- Nonzero numbers are used as boundary markers for the meta parts
-- because they do not appear in metaphones. This helps to reduce
-- false positives from boundary characters falsely matching.
SELECT
key = :key,
data = ''
+ :name_str + '`'
+ :name_meta + '1'
+ :name_meta + '1'
+ :name_meta + '1'
+ :email + '`'
+ :phone + '`'
+ :address
FROM
collection temp
";
}
19 changes: 19 additions & 0 deletions kardia-app/modules/base/dups/get/emails.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
$Version=2$
get_emails "system/query"
{
// Returns one row per usable email record, with two attributes:
//   key   - the record's p_partner_key.
//   email - the record's p_contact_data value, unchanged.
//
// The email field is incredibly simple to get. We simply query for all
// p_contact_info records with a p_contact_type of E (for email) and read
// the email directly from the p_contact_data field. If this field is null,
// empty, or only a single character long, the 'email' is ignored (the
// WHERE clause requires more than one character).
sql = "
SELECT
key = :p_partner_key,
email = :p_contact_data
FROM
identity /apps/kardia/data/Kardia_DB/p_contact_info/rows
WHERE
:p_contact_type = 'E'
AND char_length(isnull(:p_contact_data, '')) > 1
;
";
}
32 changes: 32 additions & 0 deletions kardia-app/modules/base/dups/get/name_metas.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
$Version=2$
get_name_metas "system/query"
{
// Returns one row per usable p_partner record, with two attributes:
//   key       - the record's p_partner_key.
//   name_meta - the metaphone values of the name fields, concatenated with
//               no separator in the order given name, preferred name,
//               surname.
//
// We calculate the metaphone value separately for the p_given_name,
// p_preferred_name, and p_surname fields. As with names, the
// p_preferred_name field is ignored if it matches the given name or
// the surname.
//
// Only records whose p_given_name and p_surname are both longer than one
// character are returned; all other records are skipped by the WHERE
// clause.
//
// Note: p_org_name is not considered because we determined that it was
//       unlikely that someone would have to guess the spelling of an
//       organization's name from how it sounded. Also, such names tend
//       to be long and ill-suited for the double metaphone algorithm.
sql = "
SELECT
key = :p_partner_key,
name_meta = ''
+ isnull(metaphone(:p_given_name), '')
+ isnull(condition(
:p_preferred_name != :p_given_name
AND :p_preferred_name != :p_surname,
metaphone(:p_preferred_name),
''
), '')
+ isnull(metaphone(:p_surname), '')
FROM
/apps/kardia/data/Kardia_DB/p_partner/rows
WHERE
char_length(isnull(:p_given_name, '')) > 1
AND char_length(isnull(:p_surname, '')) > 1
;
";
}
35 changes: 35 additions & 0 deletions kardia-app/modules/base/dups/get/name_strs.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
$Version=2$
get_names "system/query"
{
// Returns one row per p_partner record (there is no WHERE clause), with
// two attributes:
//   key      - the record's p_partner_key.
//   name_str - the name fields concatenated with no separator in the order
//              given name, preferred name, surname, organization name;
//              null fields contribute an empty string.
//
// We concatenate four fields to produce the name: p_given_name,
// p_preferred_name, p_surname, and p_org_name. p_preferred_name is
// ignored if it adds no new information because it matches the given
// name or the surname, and it is also ignored for organizations. We
// detect organizations because they do not have a p_given_name or a
// p_surname value (the query checks that both are longer than one
// character) and only the p_org_name field is considered for them
// (it's long enough on its own, anyway). However, p_org_name is still
// considered for people, too.
//
// NOTE(review): this object is named get_names although the file is
// name_strs.qy; the sibling query files name their object after the file
// (for example get_name_metas in name_metas.qy). Confirm whether anything
// depends on the object name before renaming it.
//
// Note: Don't make the mistake of calling an attribute "name" or it might
//       accidentally become the canonical name of that object in the object
//       system, causing a ton of stuff to break in subtle and confusing ways.
sql = "
SELECT
key = :p_partner_key,
name_str = ''
+ isnull(:p_given_name, '')
+ isnull(condition(
char_length(isnull(:p_given_name, '')) > 1
AND char_length(isnull(:p_surname, '')) > 1
AND :p_given_name != :p_preferred_name
AND :p_surname != :p_preferred_name,
:p_preferred_name,
''
), '')
+ isnull(:p_surname, '')
+ isnull(:p_org_name, '')
FROM
/apps/kardia/data/Kardia_DB/p_partner/rows
;
";
}
25 changes: 25 additions & 0 deletions kardia-app/modules/base/dups/get/phones.qy
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
$Version=2$
get_phones "system/query"
{
// Returns one row per usable phone record, with two attributes:
//   key   - the record's p_partner_key.
//   phone - p_phone_country, p_phone_area_city, and p_contact_data
//           concatenated in that order with no separator; null fields
//           contribute an empty string.
//
// The phone field can be found by querying for all p_contact_info records
// with a p_contact_type of P (for phone) or C (for cellphone), both of
// which we treat the same for the purposes of this algorithm. Then, we
// read the country code / area code from the p_phone_country field, the
// second 3-digit code from the p_phone_area_city field, and finally the
// last four digits from the p_contact_data field. If the p_contact_data
// field is null, empty, or only a single character long, the 'phone
// number' is ignored (the WHERE clause requires more than one character).
//
// NOTE(review): the two p_contact_type comparisons are combined with '+';
// presumably this acts as a logical OR, matching the "P or C" description
// above -- confirm.
sql = "
SELECT
key = :p_partner_key,
phone = ''
+ isnull(:p_phone_country, '')
+ isnull(:p_phone_area_city, '')
+ isnull(:p_contact_data, '')
FROM
identity /apps/kardia/data/Kardia_DB/p_contact_info/rows
WHERE
((:p_contact_type = 'P') + (:p_contact_type = 'C'))
AND char_length(isnull(:p_contact_data, '')) > 1
;
";
}
Loading