diff --git a/kardia-app/modules/base/dups/cluster_params.qy b/kardia-app/modules/base/dups/cluster_params.qy new file mode 100644 index 00000000..5d5a9143 --- /dev/null +++ b/kardia-app/modules/base/dups/cluster_params.qy @@ -0,0 +1,21 @@ +$Version=2$ +duplicate_checking_globals "system/query" + { + // Computes parameters for clustering and searching, based on the provided + // size of the data. + + num_data "query/parameter" { type = integer; style = strnull; min = 1; } + + sql = " + SELECT + -- Compute which clustering algorithm to use (no clustering is better for small amounts of data). + algorithm = condition(:parameters:num_data > 50000, 'kmeans', 'none'), + + -- Compute the k value for clustering. + k = constrain(convert(integer, power(log(:parameters:num_data, 36), 3.2) - 8), 2, convert(integer, NULL)) + FROM + /apps/kardia/data/Kardia_DB/_a_alphabet/rows + LIMIT 1 + ; + "; + } diff --git a/kardia-app/modules/base/dups/dups.cluster b/kardia-app/modules/base/dups/dups.cluster new file mode 100644 index 00000000..016376ec --- /dev/null +++ b/kardia-app/modules/base/dups/dups.cluster @@ -0,0 +1,68 @@ +$Version=2$ +cluster_dups "system/cluster" + { + // Declare parameters. + algorithm "cluster/parameter" { type = string; style = notnull; } + k "cluster/parameter" { type = integer; style = notnull; } + field "cluster/parameter" { type = string; style = notnull; } + data "cluster/parameter" { type = string; style = notnull; } + + // Declare data source. + source = runserver('/apps/kardia/modules/base/dups/get/' + :parameters:field + '.qy'); + key_attr = "key"; + data_attr = runserver(:parameters:data); + + // A cluster for searching with clustering. + kmeans "cluster/cluster" + { + algorithm = "k-means"; + similarity_measure = "cosine"; + num_clusters = runserver(:parameters:k); + min_improvement = 0.0001; + max_iterations = 32; + } + + // A "cluster" for searching without clustering. 
+ none "cluster/cluster" + { + algorithm = "none"; + } + + // Default duplicate search, used for names, emails, and addresses. + dups "cluster/search" + { + source = runserver(:parameters:algorithm); + similarity_measure = "cosine"; + threshold = 0.7; + } + + // Double Metaphone search. + // Double Metaphone is prone to false positives, so it uses a higher + // threshold to make them slightly less bad. + meta_dups "cluster/search" + { + source = runserver(:parameters:algorithm); + similarity_measure = "levenshtein"; + threshold = 0.8; + } + + // Phone search. + // Searching for duplicate phone numbers uses edit distance similarity + // instead of cosine to give more accurate results. + phone_dups "cluster/search" + { + source = runserver(:parameters:algorithm); + similarity_measure = "levenshtein"; + threshold = 0.7; + } + + // Concat search. + // The concatenation strategy is faster than the aggregation strategy, so + // we can use a slightly lower threshold to detect just a few more dups. + concat_dups "cluster/search" + { + source = runserver(:parameters:algorithm); + similarity_measure = "cosine"; + threshold = 0.65; + } + } diff --git a/kardia-app/modules/base/dups/get/addresses.qy b/kardia-app/modules/base/dups/get/addresses.qy new file mode 100644 index 00000000..e0db6797 --- /dev/null +++ b/kardia-app/modules/base/dups/get/addresses.qy @@ -0,0 +1,35 @@ +$Version=2$ +get_addresses "system/query" + { + // To get the address, we need to concatenate several fields found in the + // p_location table: + // - p_in_care_of: For sending mail to a recipient without an address who + // will have a different person/organization receive the + // mail for them. The address of that entity is provided. + // - p_address_1, p_address_2, & p_address_3: Up to 3 lines of an address. + // - p_city, p_state_province: The city and state (respectively). + // - p_country_code, p_postal_code: The country and postal code (respectively).
+ // + // If the p_address_1 field is null or does not exist, the 'address' is ignored. + // This is very common because many systems in Centrallix assume that every + // record has an address, so every record has an associated address, even if it + // is almost completely blank. + sql = " + SELECT + key = :p_partner_key, + address = '' + + isnull(:p_in_care_of, '') + + isnull(:p_address_1, '') + + isnull(:p_address_2, '') + + isnull(:p_address_3, '') + + isnull(:p_city, '') + + isnull(:p_state_province, '') + + isnull(:p_country_code, '') + + isnull(:p_postal_code, '') + FROM + identity /apps/kardia/data/Kardia_DB/p_location/rows + WHERE + char_length(isnull(:p_address_1, '')) > 1 + ; + "; + } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/concats.qy b/kardia-app/modules/base/dups/get/concats.qy new file mode 100644 index 00000000..d9ae6c78 --- /dev/null +++ b/kardia-app/modules/base/dups/get/concats.qy @@ -0,0 +1,87 @@ +$Version=2$ +get_concats "system/query" + { + // All data found in the files name_strs.qy, name_metas.qy, emails.qy, + // phones.qy, and addresses.qy is concatenated together here (although + // those files aren't read here for performance reasons), producing a + // single concatenated string with all the information for a person. + // This string is produced for every p_partner record in the + // database. Each record can have up to one email, phone number, and + // address, so multiple records are produced for a given p_partner record + // if various combinations of this contact information are possible. + // + // Note: Don't make the mistake of calling an attribute "name" or it might + // accidentally become the canonical name of that object in the object + // system, causing a ton of stuff to break in subtle and confusing ways.
+ sql = " + declare collection temp; + + INSERT INTO + collection temp + SELECT + key = :p:p_partner_key, + name_str = '' + + isnull(:p_given_name, '') + + isnull(condition( + char_length(isnull(:p_given_name, '')) > 1 + AND char_length(isnull(:p_surname, '')) > 1 + AND :p_given_name != :p_preferred_name + AND :p_surname != :p_preferred_name, + :p_preferred_name, + '' + ), '') + + isnull(:p_surname, '') + + isnull(:p_org_name, ''), + name_meta = '' + + isnull(metaphone(:p_given_name), '') + + isnull(condition( + :p_given_name != :p_preferred_name + AND :p_surname != :p_preferred_name, + metaphone(:p_preferred_name), + '' + ), '') + + isnull(metaphone(:p_surname), ''), + email = isnull(:e:p_contact_data, ''), + phone = '' + + isnull(:ph:p_phone_country, '') + + isnull(:ph:p_phone_area_city, '') + + isnull(:ph:p_contact_data, ''), + address = '' + + isnull(:l:p_in_care_of, '') + + isnull(:l:p_address_1, '') + + isnull(:l:p_address_2, '') + + isnull(:l:p_address_3, '') + + isnull(:l:p_city, '') + + isnull(:l:p_state_province, '') + + isnull(:l:p_country_code, '') + + isnull(:l:p_postal_code, '') + FROM + identity /apps/kardia/data/Kardia_DB/p_partner/rows p, + /apps/kardia/data/Kardia_DB/p_contact_info/rows e, + /apps/kardia/data/Kardia_DB/p_contact_info/rows ph, + /apps/kardia/data/Kardia_DB/p_location/rows l + WHERE + :p:p_partner_key *= :e:p_partner_key + AND :p:p_partner_key *= :ph:p_partner_key + AND :p:p_partner_key *= :l:p_partner_key + AND :e:p_contact_type = 'E' + AND ((:ph:p_contact_type = 'P') + (:ph:p_contact_type = 'C')) + ; + + -- Nonzero numbers are used as boundary markers for the meta parts + -- because they do not appear in metaphones. This helps to reduce + -- false positives from boundary characters falsely matching. 
+ SELECT + key = :key, + data = '' + + :name_str + '`' + + :name_meta + '1' + + :name_meta + '1' + + :name_meta + '1' + + :email + '`' + + :phone + '`' + + :address + FROM + collection temp + "; + } diff --git a/kardia-app/modules/base/dups/get/emails.qy b/kardia-app/modules/base/dups/get/emails.qy new file mode 100644 index 00000000..73294e77 --- /dev/null +++ b/kardia-app/modules/base/dups/get/emails.qy @@ -0,0 +1,19 @@ +$Version=2$ +get_emails "system/query" + { + // The email field is incredibly simple to get. We simply query for all + // p_contact_info records with a p_contact_type of E (for email) and read + // the email directly from the p_contact_data field. If this field is null + // or does not exist, the 'email' is ignored. + sql = " + SELECT + key = :p_partner_key, + email = :p_contact_data + FROM + identity /apps/kardia/data/Kardia_DB/p_contact_info/rows + WHERE + :p_contact_type = 'E' + AND char_length(isnull(:p_contact_data, '')) > 1 + ; + "; + } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/get/name_metas.qy b/kardia-app/modules/base/dups/get/name_metas.qy new file mode 100644 index 00000000..1491f62f --- /dev/null +++ b/kardia-app/modules/base/dups/get/name_metas.qy @@ -0,0 +1,32 @@ +$Version=2$ +get_name_metas "system/query" + { + // We calculate the metaphone value separately for the p_given_name, + // p_preferred_name, and p_surname fields. As with names, the + // p_preferred_name field is ignored if it matches the given name or + // the surname. + // + // Note: p_org_name is not considered because we determined that it was + // unlikely that someone would have to guess the spelling of an + // organization's name from how it sounded. Also, such names tend + // to be long and ill-suited for the double metaphone algorithm.
+ sql = " + SELECT + key = :p_partner_key, + name_meta = '' + + isnull(metaphone(:p_given_name), '') + + isnull(condition( + :p_preferred_name != :p_given_name + AND :p_preferred_name != :p_surname, + metaphone(:p_preferred_name), + '' + ), '') + + isnull(metaphone(:p_surname), '') + FROM + /apps/kardia/data/Kardia_DB/p_partner/rows + WHERE + char_length(isnull(:p_given_name, '')) > 1 + AND char_length(isnull(:p_surname, '')) > 1 + ; + "; + } diff --git a/kardia-app/modules/base/dups/get/name_strs.qy b/kardia-app/modules/base/dups/get/name_strs.qy new file mode 100644 index 00000000..dfb7cc02 --- /dev/null +++ b/kardia-app/modules/base/dups/get/name_strs.qy @@ -0,0 +1,35 @@ +$Version=2$ +get_names "system/query" + { + // We concatenate four fields to produce the name: p_given_name, + // p_preferred_name, p_surname, and p_org_name. p_preferred_name is + // ignored if it adds no new information because it matches the given + // name or the surname, and it is also ignored for organizations. We + // detect organizations because they do not have a p_given_name or a + // p_surname value and only the p_org_name field is considered for them + // (it's long enough on its own, anyway). However, p_org_name is still + // considered for people, too. + // + // Note: Don't make the mistake of calling an attribute "name" or it might + // accidentally become the canonical name of that object in the object + // system, causing a ton of stuff to break in subtle and confusing ways.
+ sql = " + SELECT + key = :p_partner_key, + name_str = '' + + isnull(:p_given_name, '') + + isnull(condition( + char_length(isnull(:p_given_name, '')) > 1 + AND char_length(isnull(:p_surname, '')) > 1 + AND :p_given_name != :p_preferred_name + AND :p_surname != :p_preferred_name, + :p_preferred_name, + '' + ), '') + + isnull(:p_surname, '') + + isnull(:p_org_name, '') + FROM + /apps/kardia/data/Kardia_DB/p_partner/rows + ; + "; + } diff --git a/kardia-app/modules/base/dups/get/phones.qy b/kardia-app/modules/base/dups/get/phones.qy new file mode 100644 index 00000000..f06f2cb2 --- /dev/null +++ b/kardia-app/modules/base/dups/get/phones.qy @@ -0,0 +1,25 @@ +$Version=2$ +get_phones "system/query" + { + // The phone field can be found by querying for all p_contact_info records + // with a p_contact_type of P (for phone) or C (for cellphone), both of + // which we treat the same for the purposes of this algorithm. Then, we + // read the country code / area code from the p_phone_country field, the + // second 3-digit code from the p_phone_area_city field, and finally the + // last four digits from the p_contact_data field. If this field is null + // or does not exist, the 'phone number' is ignored.
+ sql = " + SELECT + key = :p_partner_key, + phone = '' + + isnull(:p_phone_country, '') + + isnull(:p_phone_area_city, '') + + isnull(:p_contact_data, '') + FROM + identity /apps/kardia/data/Kardia_DB/p_contact_info/rows + WHERE + ((:p_contact_type = 'P') + (:p_contact_type = 'C')) + AND char_length(isnull(:p_contact_data, '')) > 1 + ; + "; + } \ No newline at end of file diff --git a/kardia-app/modules/base/dups/update.qy b/kardia-app/modules/base/dups/update.qy new file mode 100644 index 00000000..93b36f1c --- /dev/null +++ b/kardia-app/modules/base/dups/update.qy @@ -0,0 +1,667 @@ +$Version=2$ +update_duplicates "system/query" + { + // This script updates the p_dup table (/apps/kardia/data/Kardia_DB/p_dup/rows) + // by detecting duplicates using two strategies (aggregation and concatenation) + // and then upserting the results into the table. + // + // The aggregation strategy first searches for dups on several groups of fields + // which it fetches using name_strs.qy, name_metas.qy, emails.qy, phones.qy, and + // addresses.qy. We invoke the dups.cluster file to get a list of dups for each + // of these field groups (name_str_dups, email_dups, etc.). Then, we add each dup + // found by the other searches to name_str_dups by computing its name similarity. + // Next, we add each dup in name_str_dups to each other collection by computing the + // similarity of the respective fields. Thus, every dup exists in all five + // collections. Finally, we aggregate these collections using this equation: + // `average(max(name_str_sim, name_meta_sim * 0.9), email_sim, phone_sim, address_sim)` + // If a field is missing (e.g. either possible dup record doesn't have an email), + // we use a value of -1, and all values less than 0 are ignored by the average. + // (This avoids overlooking records that are missing a lot of data.) + // + // The concatenation strategy is simpler.
We simply concatenate all relevant + // fields (collection concats), then search for dups (collection concat_dups) + // on the concatenated data. + // + // Each strategy has pros and cons. + // - Aggregation avoids overlooking records when one is missing information. + // - Aggregation provides clearer reasoning to the database administrator. + // - Aggregation can use Levenshtein for some fields (name_meta and phone), + // and cosine for others. Concatenation uses the same measure for everything. + // - Concatenation finds dups where someone's email is similar to another + // person's name, which can tip us off to relationships we'd otherwise miss. + // - Concatenation is slightly faster, so it can be run with a lower threshold, + // making it more sensitive to near dups in some cases. + // - Concatenation is prone to cryptic false positives. + // + // In short, aggregation gives us more control, allowing us to find far more dups, + // but it comes at a cost of greater complexity, slower compute times, and missing + // certain specific cases of duplicates. + // + // Note: If concat_dups detects a dup, we also run the other field checks on it + // even though concatenation similarity is not used in the aggregation equation. + // This is because it helps to enhance the reason field with additional info + // that the database administrator can see at a glance. + + sql = " + DECLARE object value; + + -- The number of places to show after the decimal point for reason + -- similarity percentages. + SELECT :value:reason_decimals = 1; + + -- The minimum aggregated similarity threshold that must be reached + -- for a duplicate to be added to the table and displayed to the user. + -- Duplicates which aggregate to a lower similarity than this are + -- dropped before being added even if one attribute happens to have a + -- high similarity. + -- For other similarity thresholds, see `dups.cluster`. + SELECT :value:min_total_sim = 0.60; + + print 'Getting data...'
-- Build one collection per field group. Every source query yields a partner
-- key together with the field text that the searches below compare.

-- Names as plain strings.
DECLARE collection name_strs;
INSERT INTO
    collection name_strs
SELECT
    :key,
    :name_str
FROM
    /apps/kardia/modules/base/dups/get/name_strs.qy
;

-- Names as metaphone codes.
DECLARE collection name_metas;
INSERT INTO
    collection name_metas
SELECT
    :key,
    :name_meta
FROM
    /apps/kardia/modules/base/dups/get/name_metas.qy
;

-- Email addresses.
DECLARE collection emails;
INSERT INTO
    collection emails
SELECT
    :key,
    :email
FROM
    /apps/kardia/modules/base/dups/get/emails.qy
;

-- Phone numbers.
DECLARE collection phones;
INSERT INTO
    collection phones
SELECT
    :key,
    :phone
FROM
    /apps/kardia/modules/base/dups/get/phones.qy
;

-- Postal addresses.
DECLARE collection addresses;
INSERT INTO
    collection addresses
SELECT
    :key,
    :address
FROM
    /apps/kardia/modules/base/dups/get/addresses.qy
;

-- All of the above concatenated into a single string per record.
DECLARE collection concats;
INSERT INTO
    collection concats
SELECT
    :key,
    :data
FROM
    /apps/kardia/modules/base/dups/get/concats.qy
;

print 'Counting data...'
-- Remember how many rows each collection holds; the counts are passed to
-- cluster_params.qy as num_data in the next step.
SELECT
    :value:name_strs_count = count(1)
FROM
    collection name_strs
;
SELECT
    :value:name_metas_count = count(1)
FROM
    collection name_metas
;
SELECT
    :value:emails_count = count(1)
FROM
    collection emails
;
SELECT
    :value:phones_count = count(1)
FROM
    collection phones
;
SELECT
    :value:addresses_count = count(1)
FROM
    collection addresses
;
SELECT
    :value:concats_count = count(1)
FROM
    collection concats
;

print 'Computing parameters...'
-- Look up the clustering algorithm and k value for each collection.
-- Ask cluster_params.qy, once per collection, which algorithm and k to use
-- for that collection's row count, and keep the answers on the value object.
SELECT
    :value:name_strs_algorithm = :name_strs_params:algorithm,
    :value:name_strs_k = :name_strs_params:k,
    :value:name_metas_algorithm = :name_metas_params:algorithm,
    :value:name_metas_k = :name_metas_params:k,
    :value:emails_algorithm = :emails_params:algorithm,
    :value:emails_k = :emails_params:k,
    :value:phones_algorithm = :phones_params:algorithm,
    :value:phones_k = :phones_params:k,
    :value:addresses_algorithm = :addresses_params:algorithm,
    :value:addresses_k = :addresses_params:k,
    :value:concats_algorithm = :concats_params:algorithm,
    :value:concats_k = :concats_params:k
FROM
    expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:name_strs_count) name_strs_params,
    expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:name_metas_count) name_metas_params,
    expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:emails_count) emails_params,
    expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:phones_count) phones_params,
    expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:addresses_count) addresses_params,
    expression ('/apps/kardia/modules/base/dups/cluster_params.qy?num_data=' + :value:concats_count) concats_params
;


print 'Searching for name_str_dups...'
-- Name search: read the dups search of dups.cluster, parameterized for the
-- name_strs query and its name_str attribute, into name_str_dups.
DECLARE collection name_str_dups;
INSERT INTO
    collection name_str_dups
SELECT
    :key1,
    :key2,
    :sim
FROM
    identity expression (
        '/apps/kardia/modules/base/dups/dups.cluster'
        + '?algorithm=' + :value:name_strs_algorithm
        + '&k=' + :value:name_strs_k
        + '&field=name_strs'
        + '&data=name_str'
        + '/dups'
    )
;

print 'Searching for name_meta dups...'
-- Get name_meta_dups using the dups.cluster file.
-- Phonetic-name search: reads the meta_dups search of dups.cluster over the
-- name_meta attribute of the name_metas query.
DECLARE collection name_meta_dups;
INSERT INTO
    collection name_meta_dups
SELECT
    key1 = :key1,
    key2 = :key2,
    sim = :sim
FROM
    identity expression (
        '/apps/kardia/modules/base/dups/dups.cluster'
        + '?algorithm=' + :value:name_metas_algorithm
        + '&k=' + :value:name_metas_k
        + '&field=name_metas'
        + '&data=name_meta'
        + '/meta_dups'
    )
;

print 'Searching for email dups...'
-- Get email_dups using the dups.cluster file.
-- Grouping on the key pair with max(sim) leaves one row per pair holding the
-- highest similarity reported for it.
DECLARE collection email_dups;
INSERT INTO
    collection email_dups
SELECT
    key1 = :d:key1,
    key2 = :d:key2,
    sim = max(:d:sim)
FROM
    identity expression (
        '/apps/kardia/modules/base/dups/dups.cluster'
        + '?algorithm=' + :value:emails_algorithm
        + '&k=' + :value:emails_k
        + '&field=emails'
        + '&data=email'
        + '/dups'
    ) d
GROUP BY
    :d:key1,
    :d:key2
;

print 'Searching for phone dups...'
-- Get phone_dups using the dups.cluster file (phone_dups search).
-- The source is marked identity like the other grouped searches in this
-- script, so all four field searches read their source the same way.
DECLARE collection phone_dups;
INSERT INTO
    collection phone_dups
SELECT
    key1 = :d:key1,
    key2 = :d:key2,
    sim = max(:d:sim)
FROM
    identity expression (
        '/apps/kardia/modules/base/dups/dups.cluster'
        + '?algorithm=' + :value:phones_algorithm
        + '&k=' + :value:phones_k
        + '&field=phones'
        + '&data=phone'
        + '/phone_dups'
    ) d
GROUP BY
    :d:key1,
    :d:key2
;

print 'Searching for address dups...'
-- Get address_dups using the dups.cluster file.
DECLARE collection address_dups;
INSERT INTO
    collection address_dups
SELECT
    key1 = :d:key1,
    key2 = :d:key2,
    sim = max(:d:sim)
FROM
    identity expression (
        '/apps/kardia/modules/base/dups/dups.cluster'
        + '?algorithm=' + :value:addresses_algorithm
        + '&k=' + :value:addresses_k
        + '&field=addresses'
        + '&data=address'
        + '/dups'
    ) d
GROUP BY
    :d:key1,
    :d:key2
;

print 'Searching for concat dups...'
-- Get concat_dups using the dups.cluster file.
-- Concatenation search: reads the concat_dups search of dups.cluster over the
-- data attribute of the concats query, keeping the best similarity per pair.
DECLARE collection concat_dups;
INSERT INTO
    collection concat_dups
SELECT
    key1 = :hit:key1,
    key2 = :hit:key2,
    sim = max(:hit:sim)
FROM
    identity expression (
        '/apps/kardia/modules/base/dups/dups.cluster'
        + '?algorithm=' + :value:concats_algorithm
        + '&k=' + :value:concats_k
        + '&field=concats'
        + '&data=data'
        + '/concat_dups'
    ) hit
GROUP BY
    :hit:key1,
    :hit:key2
;


print 'Adding other collections dups to name_str_dups...'
print '[name_str_dups <- name_meta_dups]'
-- [name_str_dups <- name_meta_dups]
-- Every pair in name_meta_dups gets a row in name_str_dups.
--   pair     : the name_meta_dups row being copied over
--   known    : the matching name_str_dups row, if any (outer join)
--   lhs, rhs : the name_strs rows for key1 and key2 (outer joins)
-- A non-negative similarity already stored in name_str_dups is reused.
-- Otherwise the cosine similarity of the two name strings is computed, and
-- -1.0 is stored when that comparison yields null.
INSERT INTO
    collection name_str_dups
SELECT
    key1 = :pair:key1,
    key2 = :pair:key2,
    sim = condition(:known:sim >= 0.0,
        :known:sim,
        isnull(cos_compare(:lhs:name_str, :rhs:name_str), -1.0)
    )
FROM
    identity collection name_meta_dups pair,
    collection name_str_dups known,
    collection name_strs lhs,
    collection name_strs rhs
WHERE
    :pair:key1 *= :known:key1
    AND :pair:key2 *= :known:key2
    AND :pair:key1 *= :lhs:key
    AND :pair:key2 *= :rhs:key
-- Rows that already exist are updated in place (in case we have a
-- cross-cluster match).
ON duplicate
    :key1,
    :key2
UPDATE SET
    :sim = condition(:known:sim >= 0.0,
        :known:sim,
        isnull(cos_compare(:lhs:name_str, :rhs:name_str), -1.0)
    )
;

print '[name_str_dups <- email_dups]'
-- [name_str_dups <- email_dups] Add an entry to name_str_dups for each entry in email_dups.
-- Backfill pattern shared by the three statements below: for each pair found
-- by another field search, make sure name_str_dups has a row for that pair.
-- An existing non-negative name similarity is kept. Otherwise the cosine
-- similarity of the two name strings is computed, with -1.0 standing in when
-- the comparison yields null.
INSERT INTO
    collection name_str_dups
SELECT
    key1 = :email_dup:key1,
    key2 = :email_dup:key2,
    sim = condition(:name_str_dup:sim >= 0.0,
        :name_str_dup:sim,
        isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0)
    )
FROM
    identity collection email_dups email_dup,
    collection name_str_dups name_str_dup,
    collection name_strs name_str1,
    collection name_strs name_str2
WHERE
    :email_dup:key1 *= :name_str_dup:key1
    AND :email_dup:key2 *= :name_str_dup:key2
    AND :email_dup:key1 *= :name_str1:key
    AND :email_dup:key2 *= :name_str2:key
ON duplicate -- Update entries that already exist (in case we have a cross-cluster match).
    :key1,
    :key2
UPDATE SET
    :sim = condition(:name_str_dup:sim >= 0.0,
        :name_str_dup:sim,
        isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0)
    )
;

print '[name_str_dups <- phone_dups]'
-- [name_str_dups <- phone_dups] Add an entry to name_str_dups for each entry in phone_dups.
-- The grouping keys are those of the driving source (phone_dups). The
-- name_str_dups side is outer-joined, so its keys are null for exactly the
-- pairs this statement exists to add and cannot serve as grouping keys.
INSERT INTO
    collection name_str_dups
SELECT
    key1 = :phone_dup:key1,
    key2 = :phone_dup:key2,
    sim = condition(:name_str_dup:sim >= 0.0,
        :name_str_dup:sim,
        max(isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0))
    )
FROM
    identity collection phone_dups phone_dup,
    collection name_str_dups name_str_dup,
    collection name_strs name_str1,
    collection name_strs name_str2
WHERE
    :phone_dup:key1 *= :name_str_dup:key1
    AND :phone_dup:key2 *= :name_str_dup:key2
    AND :phone_dup:key1 *= :name_str1:key
    AND :phone_dup:key2 *= :name_str2:key
GROUP BY
    :phone_dup:key1,
    :phone_dup:key2
ON duplicate -- Update entries that already exist (in case we have a cross-cluster match).
    :key1,
    :key2
UPDATE SET
    :sim = condition(:name_str_dup:sim >= 0.0,
        :name_str_dup:sim,
        max(isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0))
    )
;

print '[name_str_dups <- address_dups]'
-- [name_str_dups <- address_dups] Add an entry to name_str_dups for each entry in address_dups.
-- Grouped by the address_dups keys for the same reason as the phone case.
INSERT INTO
    collection name_str_dups
SELECT
    key1 = :address_dup:key1,
    key2 = :address_dup:key2,
    sim = condition(:name_str_dup:sim >= 0.0,
        :name_str_dup:sim,
        max(isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0))
    )
FROM
    identity collection address_dups address_dup,
    collection name_str_dups name_str_dup,
    collection name_strs name_str1,
    collection name_strs name_str2
WHERE
    :address_dup:key1 *= :name_str_dup:key1
    AND :address_dup:key2 *= :name_str_dup:key2
    AND :address_dup:key1 *= :name_str1:key
    AND :address_dup:key2 *= :name_str2:key
GROUP BY
    :address_dup:key1,
    :address_dup:key2
ON duplicate -- Update entries that already exist (in case we have a cross-cluster match).
    :key1,
    :key2
UPDATE SET
    :sim = condition(:name_str_dup:sim >= 0.0,
        :name_str_dup:sim,
        max(isnull(cos_compare(:name_str1:name_str, :name_str2:name_str), -1.0))
    )
;

print '[name_str_dups <- concat_dups]'
-- [name_str_dups <- concat_dups] Add an entry to name_str_dups for each entry in concat_dups.
-- This is not strictly necessary because concat_dups is not used in the final aggregation.
-- However, it can sometimes provide additional information in the reason column of the
-- UI, so this feels like it is worth the small cost in extra computation.
-- Aliases used by every statement in this section:
--   pair     : the row of the driving (identity) collection
--   known    : the matching row already in the target collection, if any
--   lhs, rhs : the field rows for key1 and key2
-- All three lookups are outer joins, so a missing match leaves nulls.
INSERT INTO
    collection name_str_dups
SELECT
    key1 = :pair:key1,
    key2 = :pair:key2,
    sim = condition(:known:sim >= 0.0,
        :known:sim,
        isnull(cos_compare(:lhs:name_str, :rhs:name_str), -1.0)
    )
FROM
    identity collection concat_dups pair,
    collection name_str_dups known,
    collection name_strs lhs,
    collection name_strs rhs
WHERE
    :pair:key1 *= :known:key1
    AND :pair:key2 *= :known:key2
    AND :pair:key1 *= :lhs:key
    AND :pair:key2 *= :rhs:key
-- Rows that already exist are updated in place (in case we have a
-- cross-cluster match).
ON duplicate
    :key1,
    :key2
UPDATE SET
    :sim = condition(:known:sim >= 0.0,
        :known:sim,
        isnull(cos_compare(:lhs:name_str, :rhs:name_str), -1.0)
    )
;


print 'Adding name_str_dups to other collections...'
print '[name_meta_dups <- name_str_dups]'
-- [name_meta_dups <- name_str_dups]
-- Every pair in name_str_dups gets a row in name_meta_dups. A stored
-- non-negative similarity is reused, otherwise the two metaphone strings are
-- compared with lev_compare and -1.0 is stored when that yields null.
INSERT INTO
    collection name_meta_dups
SELECT
    key1 = :pair:key1,
    key2 = :pair:key2,
    sim = condition(:known:sim >= 0.0,
        :known:sim,
        isnull(lev_compare(:lhs:name_meta, :rhs:name_meta), -1.0)
    )
FROM
    identity collection name_str_dups pair,
    collection name_meta_dups known,
    collection name_metas lhs,
    collection name_metas rhs
WHERE
    :pair:key1 *= :known:key1
    AND :pair:key2 *= :known:key2
    AND :pair:key1 *= :lhs:key
    AND :pair:key2 *= :rhs:key
GROUP BY
    :pair:key1,
    :pair:key2
-- Pairs already present are updated rather than inserted twice.
ON duplicate
    :key1,
    :key2
UPDATE SET
    :sim = condition(:known:sim >= 0.0,
        :known:sim,
        isnull(lev_compare(:lhs:name_meta, :rhs:name_meta), -1.0)
    )
;

print '[email_dups <- name_str_dups]'
-- [email_dups <- name_str_dups]
-- Same backfill for emails, compared with cos_compare. max() keeps the
-- highest similarity among the rows grouped under one key pair.
INSERT INTO
    collection email_dups
SELECT
    key1 = :pair:key1,
    key2 = :pair:key2,
    sim = condition(:known:sim >= 0.0,
        :known:sim,
        max(isnull(cos_compare(:lhs:email, :rhs:email), -1.0))
    )
FROM
    identity collection name_str_dups pair,
    collection email_dups known,
    collection emails lhs,
    collection emails rhs
WHERE
    :pair:key1 *= :known:key1
    AND :pair:key2 *= :known:key2
    AND :pair:key1 *= :lhs:key
    AND :pair:key2 *= :rhs:key
GROUP BY
    :pair:key1,
    :pair:key2
-- Pairs already present are updated rather than inserted twice.
ON duplicate
    :key1,
    :key2
UPDATE SET
    :sim = condition(:known:sim >= 0.0,
        :known:sim,
        max(isnull(cos_compare(:lhs:email, :rhs:email), -1.0))
    )
;

print '[phone_dups <- name_str_dups]'
-- [phone_dups <- name_str_dups]
-- Same backfill for phone numbers, compared with lev_compare.
INSERT INTO
    collection phone_dups
SELECT
    key1 = :pair:key1,
    key2 = :pair:key2,
    sim = condition(:known:sim >= 0.0,
        :known:sim,
        max(isnull(lev_compare(:lhs:phone, :rhs:phone), -1.0))
    )
FROM
    identity collection name_str_dups pair,
    collection phone_dups known,
    collection phones lhs,
    collection phones rhs
WHERE
    :pair:key1 *= :known:key1
    AND :pair:key2 *= :known:key2
    AND :pair:key1 *= :lhs:key
    AND :pair:key2 *= :rhs:key
GROUP BY
    :pair:key1,
    :pair:key2
-- Pairs already present are updated rather than inserted twice.
ON duplicate
    :key1,
    :key2
UPDATE SET
    :sim = condition(:known:sim >= 0.0,
        :known:sim,
        max(isnull(lev_compare(:lhs:phone, :rhs:phone), -1.0))
    )
;

print '[address_dups <- name_str_dups]'
-- [address_dups <- name_str_dups]
-- Same backfill for addresses, compared with cos_compare.
INSERT INTO
    collection address_dups
SELECT
    key1 = :pair:key1,
    key2 = :pair:key2,
    sim = condition(:known:sim >= 0.0,
        :known:sim,
        max(isnull(cos_compare(:lhs:address, :rhs:address), -1.0))
    )
FROM
    identity collection name_str_dups pair,
    collection address_dups known,
    collection addresses lhs,
    collection addresses rhs
WHERE
    :pair:key1 *= :known:key1
    AND :pair:key2 *= :known:key2
    AND :pair:key1 *= :lhs:key
    AND :pair:key2 *= :rhs:key
GROUP BY
    :pair:key1,
    :pair:key2
-- Pairs already present are updated rather than inserted twice.
ON duplicate
    :key1,
    :key2
UPDATE SET
    :sim = condition(:known:sim >= 0.0,
        :known:sim,
        max(isnull(cos_compare(:lhs:address, :rhs:address), -1.0))
    )
;

-- The raw field collections are no longer read after this point, so empty
-- them (they are application-scoped).
DELETE FROM collection name_strs;
DELETE FROM collection name_metas;
DELETE FROM collection emails;
DELETE FROM collection phones;
DELETE FROM collection addresses;

-- Holds every dup produced by either strategy (aggregation and concatenation).
DECLARE collection all_dups;

print 'Aggregating dups...'
-- Aggregate dups.
-- Combine the five per-field collections into one score per pair.
--   ns = name_str_dups, nm = name_meta_dups, em = email_dups,
--   ph = phone_dups,    ad = address_dups
-- sim is the sum of the usable field similarities divided by how many were
-- usable. The name term is name_str_sim raised to at least 0.9 * name_meta_sim
-- (constrain is used as a max), and any term below 0 is left out of both the
-- sum and the count.
-- The divisor is clamped to at least 0.0000001 to prevent a divide by 0 when
-- NANs wander into the data, because centrallix division does not handle NAN
-- properly.
-- reason lists each field with a similarity above 0 on its own line, and the
-- final subtraction strips the trailing newline. The phonetic line is shown
-- only when it beats the plain name similarity by more than 0.00001.
INSERT INTO
    collection all_dups
SELECT
    key1 = :ns:key1,
    key2 = :ns:key2,
    sim = (0.0
        + condition(constrain(:ns:sim, :nm:sim * 0.9, 1.0) >= 0.0, constrain(:ns:sim, :nm:sim * 0.9, 1.0), 0.0)
        + condition(:em:sim >= 0.0, :em:sim, 0.0)
        + condition(:ph:sim >= 0.0, :ph:sim, 0.0)
        + condition(:ad:sim >= 0.0, :ad:sim, 0.0)
    ) / (constrain(0.0
        + condition(:ns:sim >= 0.0 OR :nm:sim >= 0.0, 1.0, 0.0)
        + condition(:em:sim >= 0.0, 1.0, 0.0)
        + condition(:ph:sim >= 0.0, 1.0, 0.0)
        + condition(:ad:sim >= 0.0, 1.0, 0.0)
        , 0.0000001, convert(double, NULL))
    ),
    reason = ''
        + condition(:ns:sim > 0.0, 'Name (' + round(:ns:sim * 100, :value:reason_decimals) + '%)\\n', '')
        + condition(:nm:sim > 0.0 AND (:nm:sim - 0.00001) > :ns:sim, 'Phonetic Name (' + round(:nm:sim * 100, :value:reason_decimals) + '%)\\n', '')
        + condition(:em:sim > 0.0, 'Email (' + round(:em:sim * 100, :value:reason_decimals) + '%)\\n', '')
        + condition(:ph:sim > 0.0, 'Phone (' + round(:ph:sim * 100, :value:reason_decimals) + '%)\\n', '')
        + condition(:ad:sim > 0.0, 'Address (' + round(:ad:sim * 100, :value:reason_decimals) + '%)\\n', '')
        - '\\n'
FROM
    identity collection name_str_dups ns,
    collection name_meta_dups nm,
    collection email_dups em,
    collection phone_dups ph,
    collection address_dups ad
WHERE
    :ns:key1 = :nm:key1
    AND :ns:key2 = :nm:key2
    AND :ns:key1 = :em:key1
    AND :ns:key2 = :em:key2
    AND :ns:key1 = :ph:key1
    AND :ns:key2 = :ph:key2
    AND :ns:key1 = :ad:key1
    AND :ns:key2 = :ad:key2
;


print 'Adding concatenation dups...'
-- Merge in the concatenation results. A pair that is new to all_dups is
-- stored with its concat similarity. A pair that is already there keeps the
-- larger of the two similarities and gets an All line put in front of its
-- existing reason text.
INSERT INTO
    collection all_dups
SELECT
    key1 = :pair:key1,
    key2 = :pair:key2,
    sim = :pair:sim,
    reason = 'All (' + round(:pair:sim * 100, :value:reason_decimals) + '%)'
FROM
    identity collection concat_dups pair
ON duplicate
    :key1,
    :key2
UPDATE SET
    :sim = condition(:sim > :pair:sim, :sim, :pair:sim),
    :reason = 'All (' + round(:pair:sim * 100, :value:reason_decimals) + '%)\\n' + :reason
;


-- Remove all data from the dups table (for debugging).
-- This statement has no WHERE clause on purpose: it empties p_dup on every
-- run, before the insert below.
delete from /apps/kardia/data/Kardia_DB/p_dup/rows ;

print 'Storing dups...'
-- TODO: Greg - We should fix this upsert.
-- Write the surviving pairs to p_dup. Only pairs whose aggregated similarity
-- is strictly above min_total_sim are stored. Self-pairs (key1 equal to key2)
-- should never occur but have been seen, so they are filtered out here.
INSERT INTO
    /apps/kardia/data/Kardia_DB/p_dup/rows
SELECT
    p_partner_key = :key1,
    p_dup_partner_key = :key2,
    p_match_quality = :sim,
    p_reason = :reason,

    -- Required fields.
    s_date_created = getdate(),
    s_created_by = user_name(),
    s_date_modified = getdate(),
    s_modified_by = user_name()
FROM
    collection all_dups
WHERE
    :all_dups:key1 != :all_dups:key2
    AND :sim > :value:min_total_sim
ON duplicate
    :p_partner_key,
    :p_dup_partner_key
UPDATE SET
    :p_match_quality = :sim,
    :p_reason = :reason,
    :s_date_modified = getdate(),
    :s_modified_by = user_name()
;

-- Remove stale data from the dups table.
+-- DELETE +-- FROM +-- identity /apps/kardia/data/Kardia_DB/p_dup/rows d, +-- /apps/kardia/data/Kardia_DB/p_partner/rows p1, +-- /apps/kardia/data/Kardia_DB/p_partner/rows p2 +-- WHERE +-- :d:p_partner_key *= :p1:p_partner_key +-- AND :d:p_dup_partner_key *= :p2:p_partner_key +-- AND(:d:s_date_modified < isnull(:p1:s_date_modified, getdate()) +-- OR :d:s_date_modified < isnull(:p2:s_date_modified, getdate())) +-- ; + print 'Update complete' + "; + } diff --git a/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp b/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp index 12537d6c..9b6691ea 100644 --- a/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp +++ b/kardia-app/modules/base/plugin_base_dataqa_duplicates.cmp @@ -44,15 +44,25 @@ plugin_base_dataqa_duplicates "widget/component-decl" width=958; height=633; spacing=10; - - dupslbl "widget/component" + + dups_label "widget/pane" { - path="/apps/kardia/modules/base/section_label.cmp"; - height=26; - fl_height=0; - text = runclient("Potential Duplicate Partners..."); + height = 32; width = 958; + fl_height = 0; + border_radius = 8; + widget_class = "label"; + + lbl "widget/label" + { + x = 6; y = 6; + width = 190; height = 18; + font_size = 13; + + widget_class = "label"; + text = "Potential Duplicate Partners..."; + } } - + dupsosrc "widget/osrc" { showobs "widget/parameter" { type=integer; default=runclient(:show_obs:value); } @@ -112,31 +122,53 @@ plugin_base_dataqa_duplicates "widget/component-decl" :d:p_partner_key, :d:p_dup_partner_key, :d:p_match_quality, - --match = convert(integer, round(:d:p_match_quality * 100)), - match = round(:d:p_match_quality, 3), - disp_name_1 = condition(char_length(rtrim(:p1:p_org_name)) > 0, :p1:p_org_name + ' ' + condition(char_length(:p1:p_given_name + :p1:p_surname) > 0, '- ', ''), '') + isnull(:p1:p_given_name + ' ','') + isnull(:p1:p_surname + ' ',''), - disp_name_2 = condition(char_length(rtrim(:p2:p_org_name)) > 0, :p2:p_org_name + ' ' + 
condition(char_length(:p2:p_given_name + :p2:p_surname) > 0, '- ', ''), '') + isnull(:p2:p_given_name + ' ','') + isnull(:p2:p_surname + ' ',''), + match = round(:d:p_match_quality, 4), + disp_name_1 = '' + + condition(char_length(rtrim(:p1:p_org_name)) > 0, + :p1:p_org_name + ' ' + condition(char_length(:p1:p_given_name + :p1:p_surname) > 0, '- ', ''), + '') + + isnull(:p1:p_given_name + ' ', '') + + isnull(:p1:p_surname + ' ', ''), + disp_name_2 = '' + + condition(char_length(rtrim(:p2:p_org_name)) > 0, + :p2:p_org_name + ' ' + condition(char_length(:p2:p_given_name + :p2:p_surname) > 0, '- ', ''), + '') + + isnull(:p2:p_given_name + ' ', '') + + isnull(:p2:p_surname + ' ', ''), stat_1 = :p1:p_status_code, stat_2 = :p2:p_status_code, - --loc_1 = :l1:p_city + ', ' + :l1:p_state_province, - --loc_2 = :l2:p_city + ', ' + :l2:p_state_province, - --contact_1 = isnull(:c1:p_phone_country + ' ','') + isnull(:c1:p_phone_area_city + ' ','') + isnull(:c1:p_contact_data,''), - --contact_2 = isnull(:c2:p_phone_country + ' ','') + isnull(:c2:p_phone_area_city + ' ','') + isnull(:c2:p_contact_data,''), ploc_1 = substitute(isnull(:af1:p_format, '[:p_in_care_of]\n[:p_address_1]\n[:p_address_2]\n[:p_address_3]\n[:p_city], [:p_state_province] [:p_postal_code]\n[:p_country_name]'), 'l=pl1,p=p1,ctry1'), ploc_2 = substitute(isnull(:af2:p_format, '[:p_in_care_of]\n[:p_address_1]\n[:p_address_2]\n[:p_address_3]\n[:p_city], [:p_state_province] [:p_postal_code]\n[:p_country_name]'), 'l=pl2,p=p2,ctry2'), - type = condition(:d:p_partner_key != :d:p_dup_partner_key, 'Duplicate Partner', condition(:d:p_contact_id is not null, 'Duplicate Contact', condition(:d:p_location_id is not null, 'Duplicate Address', ''))), - nondup = condition(:nd:p_partner_key is not null, 1, 0), - associated = condition(:p1:p_parent_key = :p2:p_partner_key or :p1:p_partner_key = :p2:p_parent_key or :p1:p_parent_key = :p2:p_parent_key, 1, 0), - merged = condition(:p1:p_merged_with = :p2:p_partner_key or 
:p1:p_partner_key = :p2:p_merged_with, 1, 0), - relation = (select condition(:r:p_partner_key = :d:p_partner_key, :rt:p_relation_type_label + '/' + :rt:p_relation_type_rev_label, :rt:p_relation_type_rev_label + '/' + :rt:p_relation_type_label) from collection rels r, collection reltype rt where ((:r:p_partner_key = :d:p_partner_key and :r:p_relation_key = :d:p_dup_partner_key) or (:r:p_partner_key = :d:p_dup_partner_key and :r:p_relation_key = :d:p_partner_key)) and :rt:p_relation_type = :r:p_relation_type ) + + -- Dupe type info. + reason = :d:p_reason, + nondup = condition(char_length(isnull(:nd:p_partner_key, '')) > 1 and char_length(isnull(:nd:p_nondup_partner_key, '')) > 1, 1, 0), + associated = condition( + :p1:p_parent_key = :p2:p_partner_key + or :p1:p_partner_key = :p2:p_parent_key + or :p1:p_parent_key = :p2:p_parent_key, + 1, 0), + merged = condition( + :p1:p_merged_with = :p2:p_partner_key + or :p1:p_partner_key = :p2:p_merged_with, + 1, 0), + relation = ( + select + condition(:r:p_partner_key = :d:p_partner_key, + :rt:p_relation_type_label + '/' + :rt:p_relation_type_rev_label, + :rt:p_relation_type_rev_label + '/' + :rt:p_relation_type_label + ) + from + collection rels r, + collection reltype rt + where :rt:p_relation_type = :r:p_relation_type + and ((:r:p_partner_key = :d:p_partner_key and :r:p_relation_key = :d:p_dup_partner_key) or + (:r:p_partner_key = :d:p_dup_partner_key and :r:p_relation_key = :d:p_partner_key )) + ) from identity /apps/kardia/data/Kardia_DB/p_dup/rows d, /apps/kardia/data/Kardia_DB/p_partner/rows p1, /apps/kardia/data/Kardia_DB/p_partner/rows p2, - --/apps/kardia/data/Kardia_DB/p_location/rows l1, - --/apps/kardia/data/Kardia_DB/p_location/rows l2, - --/apps/kardia/data/Kardia_DB/p_contact_info/rows c1, - --/apps/kardia/data/Kardia_DB/p_contact_info/rows c2, /apps/kardia/data/Kardia_DB/p_location/rows pl1, /apps/kardia/data/Kardia_DB/p_location/rows pl2, /apps/kardia/data/Kardia_DB/p_country/rows ctry1, @@ -146,30 +178,19 @@ 
plugin_base_dataqa_duplicates "widget/component-decl" /apps/kardia/data/Kardia_DB/p_nondup/rows nd where (:info:partnerlist is null or charindex(',' + rtrim(:d:p_partner_key) + ',', ',' + :info:partnerlist + ',') > 0 or charindex(',' + rtrim(:d:p_dup_partner_key) + ',', ',' + :info:partnerlist + ',') > 0) and - :p1:p_partner_key = :d:p_partner_key and - :p2:p_partner_key = :d:p_dup_partner_key and - --:l1:p_partner_key =* :d:p_partner_key and - --:l1:p_location_id =* :d:p_location_id and - --:l1:p_revision_id =* :d:p_revision_id and - --:l2:p_partner_key =* :d:p_dup_partner_key and - --:l2:p_location_id =* :d:p_dup_location_id and - --:l2:p_revision_id =* :d:p_dup_revision_id and - --:c1:p_partner_key =* :d:p_partner_key and - --:c1:p_contact_id =* :d:p_contact_id and - --:c2:p_partner_key =* :d:p_dup_partner_key and - --:c2:p_contact_id =* :d:p_dup_contact_id and - :pl1:p_partner_key =* :d:p_partner_key and - :pl2:p_partner_key =* :d:p_dup_partner_key and - :pl1:p_country_code *= :ctry1:p_country_code and - :af1:p_country_code =* :ctry1:p_country_code and - :af1:p_address_set = 'STANDARD' and - :pl2:p_country_code *= :ctry2:p_country_code and - :af2:p_country_code =* :ctry2:p_country_code and - :af2:p_address_set = 'STANDARD' and - :nd:p_partner_key =* :d:p_partner_key and - :nd:p_nondup_partner_key =* :p_dup_partner_key + :d:p_partner_key = :p1:p_partner_key and + :d:p_dup_partner_key = :p2:p_partner_key and + :d:p_partner_key *= :pl1:p_partner_key and + :d:p_dup_partner_key *= :pl2:p_partner_key and + :d:p_partner_key *= :nd:p_partner_key and + :d:p_dup_partner_key *= :nd:p_nondup_partner_key and + :pl1:p_country_code *= :ctry1:p_country_code and + :pl2:p_country_code *= :ctry2:p_country_code and + :af1:p_country_code =* :ctry1:p_country_code and + :af2:p_country_code =* :ctry2:p_country_code and + :af1:p_address_set = 'STANDARD' and + :af2:p_address_set = 'STANDARD' group by - :d:p_match_quality desc, :d:p_partner_key, :d:p_dup_partner_key order by @@ -207,13 
+228,112 @@ plugin_base_dataqa_duplicates "widget/component-decl" row_shadow_offset=1; row_shadow_color="#a0a0a0"; row_shadow_angle=135; - nodata_message="(no duplicates to show)"; + nodata_message="(no duplicates to show, congrats!)"; - //t_match "widget/table-column" { title="%"; value=runclient(:dupsosrc:match + '%'); width=60; style=bold; } - t_match "widget/table-column" { width=80; title = "%"; type=progress; padding=4; style=bold; fieldname=match; bar_color=runclient(condition(:dupsosrc:match >= 0.90, '#59b550', condition(:dupsosrc:match < 0.80, '#d96066', '#DDB261'))); bar_padding=3; bar_textcolor=black; } - t_type "widget/table-column" { title="%"; value=runclient(:dupsosrc:type + condition(:dupsosrc:merged, '\nAlready Merged', '') + condition(:dupsosrc:associated, '\nAssociated', '') + condition(:dupsosrc:relation is not null, '\nRelated: ' + :dupsosrc:relation, '')); width=180; style=bold; align=center; wrap=yes; } - t_par2 "widget/table-column" { title="Partner"; value=runclient(:dupsosrc:p_dup_partner_key + ' ' + :dupsosrc:disp_name_2); width=250; style=bold; caption_value=runclient(isnull(:dupsosrc:ploc_2 + '\n', '') + condition(:dupsosrc:stat_2 = 'O', '(obsolete)', '') - '\n'); wrap=yes; } - t_par1 "widget/table-column" { title="Partner"; value=runclient(condition(:dupsosrc:p_partner_key = :dupsosrc:p_dup_partner_key, '', :dupsosrc:p_partner_key + ' ' + :dupsosrc:disp_name_1)); width=250; style=bold; caption_value=runclient(condition(:dupsosrc:p_partner_key = :dupsosrc:p_dup_partner_key, '', isnull(:dupsosrc:ploc_1 + '\n', '') + condition(:dupsosrc:stat_1 = 'O', '(obsolete)', '') - '\n')); wrap=yes; } + // Similarity progress bar. 
+ t_match "widget/table-column" + { + width = 80; + style = bold; + padding = 4; + title = "%"; + + type = progress; + fieldname = match; + + bar_color = runclient( + condition(:dupsosrc:match >= 1.00, '#63c85b', + condition(:dupsosrc:match >= 0.95, '#86c65c', + condition(:dupsosrc:match >= 0.90, '#aac45d', + condition(:dupsosrc:match >= 0.85, '#cdc15e', + condition(:dupsosrc:match >= 0.80, '#f0bf5f', + condition(:dupsosrc:match >= 0.75, '#edaa63', + condition(:dupsosrc:match >= 0.70, '#ea9467', + condition(:dupsosrc:match >= 0.65, '#e77f6b', '#e46a6f' + ))))))))); + bar_padding = 3; + bar_textcolor = black; + + // The above bar_color values were generated using this script in JS. + // The hex function was generated by GPT5-mini using t3.chat with direct + // oversight from Israel, who wrote the rest of the code thereafter. + // + // function hex(v) { + // const clamp = (x, a = 0, b = 1) => Math.min(b, Math.max(a, x)); + // const lerp = (a, b, t) => a + (b - a) * t; + // const toHex2 = (n) => Math.round(n).toString(16).padStart(2, "0"); + // + // const t = clamp(v); + // + // const R = { r: 0xe4, g: 0x6a, b: 0x6f }; // #e46a6f + // const Y = { r: 0xf0, g: 0xbf, b: 0x5f }; // #f0bf5f + // const G = { r: 0x63, g: 0xc8, b: 0x5b }; // #63c85b + // + // if (t <= 0.6) return `#${toHex2(R.r)}${toHex2(R.g)}${toHex2(R.b)}`; + // if (t <= 0.8) { + // const u = (t - 0.6) / 0.2; + // const r = lerp(R.r, Y.r, u); + // const g = lerp(R.g, Y.g, u); + // const b = lerp(R.b, Y.b, u); + // return `#${toHex2(r)}${toHex2(g)}${toHex2(b)}`; + // } + // const u = (t - 0.8) / 0.2; + // const r = lerp(Y.r, G.r, u); + // const g = lerp(Y.g, G.g, u); + // const b = lerp(Y.b, G.b, u); + // return `#${toHex2(r)}${toHex2(g)}${toHex2(b)}`; + // } + // + // const hex_r = (sim, step) => (sim < 0.6) + // ?
` '${hex(sim)}'\n\t\t ` + // : `\n\t\t\tcondition(:dupsosrc:match >= ${Math.round(sim * 1000) / 1000}, '${hex(sim)}',${hex_r(sim - step, step)})`; + // console.log(`runclient(${hex_r(1.0, 0.05)});`); + } + + // Duplicate reason, and other information about the type of duplicate. + t_type "widget/table-column" + { + width = 180; + style = bold; + align = center; + wrap = yes; + title = "%"; + + value = runclient( + isnull(:dupsosrc:reason, "Reason missing") + + condition(:dupsosrc:merged, '\nAlready Merged', '') + + condition(:dupsosrc:associated, '\nAssociated', '') + + condition(:dupsosrc:relation is not null, '\nRelated: ' + :dupsosrc:relation, '') + ); + } + + // Duplicate record 1 (the p_dup_partner_key side of the pair). + t_dup1 "widget/table-column" + { + width = 250; + style = bold; + wrap = yes; + title = "Partner"; + + value = runclient(:dupsosrc:p_dup_partner_key + ' ' + :dupsosrc:disp_name_2); + caption_value = runclient(isnull(:dupsosrc:ploc_2 + '\n', '') + condition(:dupsosrc:stat_2 = 'O', '(obsolete)', '') - '\n'); + } + + // Duplicate record 2 (the p_partner_key side of the pair).
+ t_dup2 "widget/table-column" + { + width = 250; + style = bold; + wrap = yes; + title = "Partner"; + + value = runclient(:dupsosrc:p_partner_key + ' ' + :dupsosrc:disp_name_1); + caption_value = runclient( + isnull(:dupsosrc:ploc_1 + '\n', '') + + condition(:dupsosrc:stat_1 = 'O', '(obsolete)', '') + - '\n' + ); + } ops_detail "widget/table-row-detail" { @@ -361,6 +481,32 @@ plugin_base_dataqa_duplicates "widget/component-decl" } } } + + last_computed_osrc "widget/osrc" + { + sql = " + SELECT + display = 'Last updated:\n' + max(:s_date_modified), + dif_days = datediff(day, max(:s_date_modified), getdate()) + FROM + /apps/kardia/data/Kardia_DB/p_dup/rows + ; + "; + + last_computed_form "widget/form" + { + dups_last_updated_label "widget/label" + { + x = 840; y = 0; + height = 32; width = 110; + font_size = 11; + align = left; valign = middle; + fgcolor = runclient(condition(:last_computed_osrc:dif_days > 7, "#FF1133", "#CCCCCC")); + style = runclient(condition(:last_computed_osrc:dif_days > 7, bold, none)); + fieldname = display; + } + } + } popover_ask_comment "widget/childwindow" { diff --git a/kardia-app/modules/base/update_duplicates.qy b/kardia-app/modules/base/update_duplicates.qy deleted file mode 100644 index 058861dd..00000000 --- a/kardia-app/modules/base/update_duplicates.qy +++ /dev/null @@ -1,28 +0,0 @@ -$Version=2$ -update_duplicates "system/query" - { - sql = " delete - /apps/kardia/data/Kardia_DB/p_dup/rows - ; - - insert - /apps/kardia/data/Kardia_DB/p_dup/rows - select - p_partner_key = :p_partner_key_1, - p_dup_partner_key = :p_partner_key_2, - p_match_quality = :priority, - p_location_id = nullif(:p_location_id_1, ''), - p_dup_location_id = nullif(:p_location_id_2, ''), - p_revision_id = nullif(:p_revision_id_1, ''), - p_dup_revision_id = nullif(:p_revision_id_2, ''), - p_contact_id = nullif(:p_contact_id_1, ''), - p_dup_contact_id = nullif(:p_contact_id_2, ''), - --p_comment = :ca1 + ', ' + :ca2 + ', ' + :cc1 + ', ' + :cc2, - s_date_created = 
getdate(), - s_created_by = user_name(), - s_date_modified = getdate(), - s_modified_by = user_name() - from - /apps/kardia/modules/base/duplicate_checking.qy - "; - } diff --git a/kardia-scripts/cron/update_duplicates.sh b/kardia-scripts/cron/update_duplicates.sh index f241a5c2..b9c5b927 100755 --- a/kardia-scripts/cron/update_duplicates.sh +++ b/kardia-scripts/cron/update_duplicates.sh @@ -14,5 +14,5 @@ # Make sure /usr/local binaries are in the $PATH export PATH=$PATH:/usr/local/sbin:/usr/local/bin +# TODO: Greg - We need to update this cron to call the new file. /usr/local/bin/test_obj -c /usr/local/etc/centrallix.conf -u kardia -p $(cat /usr/local/etc/centrallix/kardia-auth) -q -C 'ls /apps/kardia/modules/base/update_duplicates.qy' 2>/dev/null >/dev/null -