diff --git a/augur/application/config.py b/augur/application/config.py index 058688dc80..c3736740c7 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -55,7 +55,6 @@ def redact_setting_value(section_name, setting_name, value): "rebuild_caches": 1, "run_analysis": 1, "run_facade_contributors": 1, - "facade_contributor_full_recollect": 0, "commit_messages": 1, }, "Server": { diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index a83d09390c..1115b63879 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -131,7 +131,6 @@ def __init__(self,logger: Logger): self.rebuild_caches = worker_options["rebuild_caches"] self.multithreaded = worker_options["multithreaded"] self.create_xlsx_summary_files = worker_options["create_xlsx_summary_files"] - self.facade_contributor_full_recollect = worker_options["facade_contributor_full_recollect"] self.commit_messages = worker_options["commit_messages"] self.tool_source = "Facade" diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 73ac3ee150..2ef5a6cd30 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -196,54 +196,25 @@ def insert_facade_contributors(self, repo_git): repo_id = repo.repo_id facade_helper = FacadeHelper(logger) - with get_session() as session: - query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) - collection_status = execute_session_query(query,'one') - last_collected_date = collection_status.facade_data_last_collected if not facade_helper.facade_contributor_full_recollect else None - - # Get all of the commit data's emails and names from the commit table that do not appear - # in the contributors table or the contributors_aliases table. + # Find commits not yet linked to a contributor (cmt_ght_author_id IS NULL), + # skipping emails already marked unresolvable. logger.info( "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) new_contrib_sql = s.sql.text(""" - SELECT DISTINCT - commits.cmt_author_name AS NAME, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'not_unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) - or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - UNION - SELECT DISTINCT - commits.cmt_author_name AS NAME,--commits.cmt_id AS id, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - ORDER BY - hash - """).bindparams(repo_id=repo_id,since_date=last_collected_date) + SELECT DISTINCT + commits.cmt_author_name AS NAME, + commits.cmt_commit_hash AS hash, + commits.cmt_author_raw_email AS email_raw + FROM + augur_data.commits + WHERE + commits.repo_id = :repo_id AND + commits.cmt_ght_author_id IS NULL AND + commits.cmt_author_raw_email NOT IN ( + SELECT email FROM augur_data.unresolved_commit_emails + ) + """).bindparams(repo_id=repo_id) #Execute statement with session. result = execute_sql(new_contrib_sql) @@ -278,37 +249,42 @@ def insert_facade_contributors(self, repo_git): logger.debug("DEBUG: Got through the new_contribs") - # sql query used to find corresponding cntrb_id's of emails found in the contributor's table - # i.e., if a contributor already exists, we use it! + # Match unlinked commits to contributors via email from any source (cntrb_email, canonical email, or alias). resolve_email_to_cntrb_id_sql = s.sql.text(""" - SELECT DISTINCT - cntrb_id, - contributors.cntrb_login AS login, - contributors.cntrb_canonical AS email, - commits.cmt_author_raw_email - FROM - contributors, - commits - WHERE - contributors.cntrb_canonical = commits.cmt_author_raw_email - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - AND commits.repo_id = :repo_id - UNION - SELECT DISTINCT - contributors_aliases.cntrb_id, - contributors.cntrb_login as login, - contributors_aliases.alias_email AS email, - commits.cmt_author_raw_email + WITH email_to_contributor AS ( + SELECT cntrb_email AS email, cntrb_id + FROM augur_data.contributors + WHERE cntrb_email IS NOT NULL + + UNION ALL + + SELECT cntrb_canonical AS email, cntrb_id + FROM augur_data.contributors + WHERE cntrb_canonical IS NOT NULL + + UNION ALL + + SELECT alias_email AS email, cntrb_id + FROM augur_data.contributors_aliases + WHERE alias_email IS NOT NULL + ), + deduplicated AS ( + SELECT DISTINCT ON (email) email, cntrb_id + FROM email_to_contributor + ORDER BY email + ) + SELECT + d.cntrb_id, + c.cmt_author_email AS email FROM - contributors, - contributors_aliases, - commits + augur_data.commits c + INNER JOIN + deduplicated d + ON c.cmt_author_email = d.email WHERE - contributors_aliases.alias_email = commits.cmt_author_raw_email - AND contributors.cntrb_id = contributors_aliases.cntrb_id - AND commits.repo_id = :repo_id - AND (:since_date is NULL OR commits.data_collection_date > :since_date) - """).bindparams(repo_id=repo_id,since_date=last_collected_date) + c.cmt_ght_author_id IS NULL AND + c.repo_id = :repo_id + """).bindparams(repo_id=repo_id) result = execute_sql(resolve_email_to_cntrb_id_sql)