diff --git a/run_autogvp.sh b/run_autogvp.sh index a62392c..cb24162 100644 --- a/run_autogvp.sh +++ b/run_autogvp.sh @@ -15,10 +15,6 @@ variant_summary_file="$BASEDIR/data/ClinVar-selected-submissions.tsv" # define parameter variables while [ $# -gt 0 ]; do case "$1" in - --workflow*|-w*) - if [[ "$1" != *=* ]]; then shift; fi # Value is next arg if no `=` - workflow="${1#*=}" - ;; --vcf*|-v*) if [[ "$1" != *=* ]]; then shift; fi vcf_file="${1#*=}" @@ -27,10 +23,6 @@ while [ $# -gt 0 ]; do if [[ "$1" != *=* ]]; then shift; fi filtering_criteria="${1#*=}" ;; - --clinvar*|-c*) - if [[ "$1" != *=* ]]; then shift; fi - clinvar_file="${1#*=}" - ;; --intervar*|-i*) if [[ "$1" != *=* ]]; then shift; fi intervar_file="${1#*=}" @@ -51,7 +43,11 @@ while [ $# -gt 0 ]; do if [[ "$1" != *=* ]]; then shift; fi out_file="${1#*=}" ;; - --selected_clinvar_submissions*) + --sample_id*|-s*) + if [[ "$1" != *=* ]]; then shift; fi + sample_id="${1#*=}" + ;; + --resolved_clinvar*|-c*) if [[ "$1" != *=* ]]; then shift; fi selected_submissions="${1#*=}" ;; @@ -76,23 +72,22 @@ while [ $# -gt 0 ]; do custom_output_cols="${1#*=}" ;; --help|-h) - echo "Usage: $0 [-w/--workflow] [-v/--vcf <*.vcf*>] [-f/--filter_criteria=] [-c/--clinvar <*.vcf>] [-i/--intervar <*.txt.intervar>] [-a/--autopvs1 <*autopvs1*>] [-m/--multianno <*multianno*>] [-o/--out ]" + echo "Usage: $0 [-v/--vcf <*.vcf*>] [-f/--filter_criteria=] [-i/--intervar <*.txt.intervar>] [-a/--autopvs1 <*autopvs1*>] [-m/--multianno <*multianno*>] [-c/--resolved_clinvar ] [-o/--out ]" echo "" echo "Options:" - echo " -w/--workflow workflow" echo " -v/--vcf VCF file" echo " -f/--filter_criteria VCF filtering criteria" - echo " -c/--clinvar ClinVar file" echo " -i/--intervar Intervar results file" echo " -a/--autopvs1 Autopvs1 results file" echo " -m/--multianno ANNOVAR file" echo " -O/--outdir output directory" echo " -o/--out output prefix" - echo " --selected_clinvar_submissions ClinVar variant file with conflicts resolved" + echo " -s/--sample_id sample ID to be added to the output file" + echo " -c/--resolved_clinvar ClinVar variant file with conflicts resolved" echo " --variant_summary ClinVar variant summary file" echo " --submission_summary ClinVar submission summary file" - echo " --conceptIDs list of conceptIDs to prioritize submissions for clinvar variant conflict resolution. Will be ignored if selected_clinvar_submissions is provided" - echo " --conflict_res how to resolve conflicts associated with conceptIDs. Will be ignored if selected_clinvar_submissions is provided or if conceptIDs are not provided" + echo " --conceptIDs list of conceptIDs to prioritize submissions for clinvar variant conflict resolution. Will be ignored if --resolved_clinvar is provided" + echo " --conflict_res how to resolve conflicts associated with conceptIDs. Will be ignored if --resolved_clinvar is provided or if conceptIDs are not provided" echo " --custom_output_cols optional; text file of user-defined column names from VCF info fields or other input file to be included in AutoGVP output files. Must contain three columns named 'Column_name', 'Rename' (i.e., what to rename colum in final output), and 'Abridged' (T or F indicating if column should be included in abridged output)" echo " -h/--help Display usage information" exit 0 @@ -122,7 +117,7 @@ if [[ ! -e $selected_submissions ]]; then submission_summary=$(find data/ -type f -name "submission_summary*") # if no files found matching pattern, download latest versions from ClinVar - # if [[ -z "$variant_summary" || -z "$submission_summary" ]] + # if [[ -z "$variant_summary" || -z "$submission_summary" ]] if [[ ! -e "$variant_summary" || ! -e "$submission_summary" ]] then @@ -145,7 +140,7 @@ if [[ ! -e $selected_submissions ]]; then echo "resolving ClinVar conflicts using default parameters..." - Rscript $BASEDIR/scripts/select-clinVar-submissions.R --variant_summary $variant_summary --submission_summary $submission_summary --outdir $out_dir + Rscript $BASEDIR/scripts/resolve-clinvar-intepretations.R --variant_summary $variant_summary --submission_summary $submission_summary --outdir $BASEDIR/refs fi @@ -153,7 +148,7 @@ if [[ ! -e $selected_submissions ]]; then echo "resolving ClinVar conflicts with provided concept IDs and taking latest date evaluated call..." - Rscript $BASEDIR/scripts/select-clinVar-submissions.R --variant_summary $variant_summary --submission_summary $submission_summary --outdir $out_dir --conceptID_list $conceptIDs --conflict_res "latest" + Rscript $BASEDIR/scripts/resolve-clinvar-intepretations.R --variant_summary $variant_summary --submission_summary $submission_summary --outdir $BASEDIR/refs --conceptID_list $conceptIDs --conflict_res "latest" fi @@ -161,11 +156,11 @@ if [[ ! -e $selected_submissions ]]; then echo "resolving ClinVar conflicts with provided concept IDs and specified conflict resolution..." - Rscript $BASEDIR/scripts/select-clinVar-submissions.R --variant_summary $variant_summary --submission_summary $submission_summary --outdir $out_dir --conceptID_list $conceptIDs --conflict_res $conflict_res + Rscript $BASEDIR/scripts/resolve-clinvar-intepretations.R --variant_summary $variant_summary --submission_summary $submission_summary --outdir $BASEDIR/refs --conceptID_list $conceptIDs --conflict_res $conflict_res fi - selected_submissions="$BASEDIR/results/ClinVar-selected-submissions.tsv" + selected_submissions="$BASEDIR/refs/resolved-clinvar-interpretations.tsv" fi @@ -184,51 +179,17 @@ intervar_input=$out_dir/${out_file}_intervar_filtered.txt # Run AutoGVP echo "Running AutoGVP..." -# Run appropriate Rscript based on workflow source (Cavatica vs. custom) -if [[ "$workflow" = "cavatica" ]] ; then - - if [[ -f $clinvar_file ]] ; then - - # Run AutoGVP from Cavatica workflow with provided clinvar vcf - Rscript $BASEDIR/scripts/02-annotate_variants_CAVATICA_input.R --vcf $autogvp_input \ - --clinvar $clinvar_file \ - --multianno $multianno_input \ - --intervar $intervar_input \ - --autopvs1 $autopvs1_input \ - --output $out_file \ - --outdir $out_dir \ - --variant_summary $selected_submissions - - else - - # Run AutoGVP from Cavatica workflow with clinvar annotation in sample vcf - Rscript $BASEDIR/scripts/02-annotate_variants_CAVATICA_input.R --vcf $autogvp_input \ - --multianno $multianno_input \ - --intervar $intervar_input \ - --autopvs1 $autopvs1_input \ - --output $out_file \ - --outdir $out_dir \ - --variant_summary $selected_submissions - - fi - - autogvp_output=${out_dir}/${out_file}".cavatica_input.annotations_report.abridged.tsv" - - else - - # Run AutoGVP from custom workflow - Rscript $BASEDIR/scripts/02-annotate_variants_custom_input.R --vcf $autogvp_input \ - --clinvar $clinvar_file \ +# Run AutoGVP variant annotation +Rscript $BASEDIR/scripts/02-annotate_variants.R --vcf $autogvp_input \ + --clinvar $selected_submissions \ --multianno $multianno_input \ --intervar $intervar_input \ --autopvs1 $autopvs1_input \ --output $out_file \ --outdir $out_dir \ - --variant_summary $selected_submissions \ + --sample_id $sample_id - autogvp_output=${out_dir}/${out_file}".custom_input.annotations_report.abridged.tsv" - -fi +autogvp_output=${out_dir}/${out_file}".annotations_report.abridged.tsv" # Parse vcf file so that info field values are in distinct columns diff --git a/scripts/02-annotate_variants_custom_input.R b/scripts/02-annotate_variants.R similarity index 56% rename from scripts/02-annotate_variants_custom_input.R rename to scripts/02-annotate_variants.R index e7989c0..a5c77fc 100644 --- a/scripts/02-annotate_variants_custom_input.R +++ b/scripts/02-annotate_variants.R @@ -1,22 +1,22 @@ ################################################################################ -# 02-annotate_variants_custom_input.R +# 02-annotate_variants.R # written by Ammar Naqvi & refactored by Saksham Phul +# updated 01/2026 by Patricia Sullivan # -# This script annotates variants based on clinVar and integrates a modified +# This script annotates variants based on ClinVar and integrates a modified # version of InterVar that involves adjustments of calls based on ACMG-AMP # guidelines # -# usage: Rscript 02-annotate_variants_custom_input.R --vcf +# usage: Rscript 02-annotate_variants.R --vcf +# --clinvar +# --multianno # --intervar # --autopvs1 -# --multianno -# --clinvar -# --variant_summary -# --submission_summary # --output +# --outdir +# --sample_id ################################################################################ -## load libraries suppressPackageStartupMessages({ library("tidyverse") library("optparse") @@ -46,15 +46,7 @@ option_list <- list( ), make_option(c("--clinvar"), type = "character", - help = "specific clinVar file (format: clinvar_yyyymmdd.vcf.gz)" - ), ## https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive_2.0/2022clinvar_20211225.vcf.gz - make_option(c("--variant_summary"), - type = "character", - help = "variant_summary file (format: variant_summary_2023-02.txt)" - ), - make_option(c("--summary_level_vcf"), - type = "character", default = "F", - help = "summary_level T/F" + help = "ClinVar resolved clinical significance file (format: resolved-clinvar-interpretations.tsv)" ), make_option(c("--output"), type = "character", default = "out", @@ -70,17 +62,14 @@ option_list <- list( ) ) - opt <- parse_args(OptionParser(option_list = option_list)) ## get input files from parameters (read) -input_clinVar_file <- opt$clinvar +input_vcf_file <- opt$vcf input_intervar_file <- opt$intervar input_autopvs1_file <- opt$autopvs1 -input_vcf_file <- opt$vcf -input_variant_summary <- opt$variant_summary input_multianno_file <- opt$multianno -summary_level <- opt$summary_level +input_clinVar_file <- opt$clinvar output_name <- opt$output results_dir <- opt$outdir sample_name <- opt$sample_id @@ -91,31 +80,11 @@ if (!dir.exists(results_dir)) { } ## output files -output_tab_abr_file <- paste0(output_name, ".custom_input.annotations_report.abridged.tsv") +output_tab_abr_file <- paste0(output_name, ".annotations_report.abridged.tsv") ## allocate more memory capacity Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2) -address_ambiguous_calls <- function(results_tab_abridged) { ## address ambiguous calls (non L/LB/P/LP/VUS) by taking the InterVar final call - - results_tab_abridged <- results_tab_abridged %>% - dplyr::mutate(new_call = case_when( - is.na(final_call) | (final_call != "Pathogenic" & - final_call != "Likely_benign" & final_call != "Likely_pathogenic" & - final_call != "Uncertain_significance" & final_call != "Benign" & - final_call != "Uncertain significance" & final_call != "Likely benign" & - final_call != "Likely pathogenic") ~ str_match(Intervar_evidence, "InterVar\\:\\s(\\w+\\s\\w+)*")[, 2], - TRUE ~ NA_character_ - )) %>% - dplyr::mutate(final_call = case_when( - !is.na(new_call) ~ new_call, - TRUE ~ final_call - )) %>% - dplyr::select(-new_call) - - return(results_tab_abridged) -} - # Open vcf and read lines until a line without '#' is found con <- file(input_vcf_file, "r") skip_lines <- 0 @@ -123,82 +92,19 @@ while (grepl("^#", readLines(con, n = 1))) { skip_lines <- skip_lines + 1 } -## make vcf dataframe and add vcf_if column -vcf_df <- vroom(input_vcf_file, skip = skip_lines, delim = "\t", col_names = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"), trim_ws = TRUE, show_col_types = FALSE) %>% - mutate( - vcf_id = str_remove_all(paste(CHROM, "-", POS, "-", REF, "-", ALT), " "), - vcf_id = str_replace_all(vcf_id, "chr", "") - ) - -## add clinvar table to this (INFO) -clinvar_anno_vcf_df <- vroom(input_clinVar_file, comment = "#", delim = "\t", col_names = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"), trim_ws = TRUE, show_col_types = FALSE) %>% - # add vcf id column - mutate(vcf_id = str_remove_all(paste(CHROM, "-", POS, "-", REF, "-", ALT), " ")) %>% - semi_join(vcf_df, by = "vcf_id") %>% - dplyr::mutate( - vcf_id = str_replace_all(vcf_id, "chr", ""), - # add star annotations to clinVar results table based on filters // ## default version - Stars = case_when( - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_single_submitter") ~ "1", - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_multiple_submitters") ~ "2", - str_detect(INFO, "CLNREVSTAT\\=reviewed_by_expert_panel") ~ "3", - str_detect(INFO, "CLNREVSTAT\\=practice_guideline") ~ "4", - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_conflicting_interpretations|CLNREVSTAT\\=criteria_provided,_conflicting_classifications") ~ "1NR", - str_detect(INFO, "CLNREVSTAT\\=no_assertion|CLNREVSTAT\\=no_interpretation|CLNREVSTAT\\=no_classification") ~ "0", - str_detect(INFO, "CLNREVSTAT") ~ "Other", - TRUE ~ NA_character_ - ), - ## extract the calls and put in own column - final_call_clinvar = str_match(INFO, "CLNSIG\\=(\\w+)([\\|\\/]\\w+)*\\;")[, 2] - ) - -if (sum(grepl("Other", clinvar_anno_vcf_df$Stars)) > 0) { - print("ERROR: there are ClinVar review statuses in data that are not recognized by AutoGVP as aligning with star values.\nPlease check that ClinVar review status values match those presented here: https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/.\nIf no discrepancies exist, please submit an issue through the AutoGVP github here: https://github.com/diskin-lab-chop/AutoGVP/issues") -} - -## store variants without clinvar info -clinvar_anti_join_vcf_df <- anti_join(vcf_df, clinvar_anno_vcf_df, by = "vcf_id") %>% +## retrieve and store input vcf into table +vcf_df <- vroom(input_vcf_file, skip = skip_lines, delim = "\t", col_names = c("CHROM", "START", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"), show_col_types = FALSE) %>% dplyr::mutate( - vcf_id = str_replace_all(vcf_id, "chr", ""), - CHROM = str_replace_all(CHROM, "chr", "") - ) %>% - dplyr::rename(rs_id = ID) - -## get latest calls from variant and submission summary files -variant_summary_df <- vroom(input_variant_summary, show_col_types = FALSE) %>% - filter(vcf_id %in% vcf_df$vcf_id) %>% - dplyr::select(-GeneSymbol) - -## filter only those variants that need consensus call and find call in submission table -entries_for_cc <- filter(clinvar_anno_vcf_df, Stars == "1NR", final_call_clinvar != "Benign", final_call_clinvar != "Pathogenic", final_call_clinvar != "Likely_benign", final_call_clinvar != "Likely_pathogenic", final_call_clinvar != "Uncertain_significance") - -entries_for_cc_in_submission <- inner_join(variant_summary_df, entries_for_cc, by = "vcf_id") %>% - dplyr::mutate(final_call_clinvar = ClinicalSignificance) %>% - dplyr::select(vcf_id, ClinicalSignificance, final_call_clinvar, Stars) - -## one Star cases that are “criteria_provided,_single_submitter” that do NOT have the B, LB, P, LP, VUS call must also go to intervar -## modified: any cases that do NOT have the B, LB, P, LP, VUS call must also go to intervar -additional_intervar_cases <- filter(clinvar_anno_vcf_df, final_call_clinvar != "Benign", final_call_clinvar != "Pathogenic", final_call_clinvar != "Likely_benign", final_call_clinvar != "Likely_pathogenic", final_call_clinvar != "Uncertain_significance") %>% - anti_join(entries_for_cc_in_submission, by = "vcf_id") %>% - anti_join(clinvar_anti_join_vcf_df, by = "vcf_id") - -clinvar_anti_join_vcf_df <- clinvar_anti_join_vcf_df %>% - mutate( - QUAL = as.character(QUAL), - POS = as.double(POS) + vcf_id = str_remove_all(paste(CHROM, "-", START, "-", REF, "-", ALT), " "), + vcf_id = str_replace(vcf_id, "chr", "") ) -## filter only those variant entries that need an InterVar run (No Star) and add the additional intervar cases from above -entries_for_intervar <- filter(clinvar_anno_vcf_df, Stars == "0" | is.na(Stars), na.rm = TRUE) %>% - bind_rows((additional_intervar_cases)) %>% - bind_rows(clinvar_anti_join_vcf_df) %>% - distinct() +## load in ClinVar resolved variant interpretations +clinvar_df <- vroom(input_clinVar_file, show_col_types = FALSE) %>% + dplyr::filter(vcf_id %in% vcf_df$vcf_id) +## TODO: Add "Stars" back in - -## get vcf ids that need intervar run -vcf_to_run_intervar <- entries_for_intervar$vcf_id - -## get multianno file to add correct vcf_id in intervar table +## get multianno file to add correct vcf_id in intervar table multianno_df <- vroom(input_multianno_file, delim = "\t", trim_ws = TRUE, col_names = TRUE, show_col_types = FALSE) %>% dplyr::select( -End, @@ -221,9 +127,8 @@ multianno_df <- vroom(input_multianno_file, delim = "\t", trim_ws = TRUE, col_na -contains(("Otherinfo")) ) - ## add intervar table -clinvar_anno_intervar_vcf_df <- vroom(input_intervar_file, delim = "\t", trim_ws = TRUE, col_names = TRUE, show_col_types = FALSE) %>% +intervar_df <- vroom(input_intervar_file, delim = "\t", trim_ws = TRUE, col_names = TRUE, show_col_types = FALSE) %>% dplyr::select( -`clinvar: Clinvar`, -contains(c("gnomad", "CADD", "Freq", "SCORE", "score", "ORPHA", "MIM", "rmsk", "GERP", "phylo")) @@ -233,24 +138,9 @@ clinvar_anno_intervar_vcf_df <- vroom(input_intervar_file, delim = "\t", trim_ws # remove coordiante, Otherinfo, gnomad, and clinVar-related columns dplyr::select( -`#Chr`, -Start, -End, -Alt, -Ref - ) - - -## combine the intervar and multianno tables by the appropriate vcf id -clinvar_anno_intervar_vcf_df <- clinvar_anno_intervar_vcf_df %>% + ) %>% dplyr::select(any_of(c("Ref.Gene", "InterVar: InterVar and Evidence", "var_id"))) %>% - left_join(multianno_df, by = "var_id") %>% - filter(vcf_id %in% c(clinvar_anno_vcf_df$vcf_id, entries_for_intervar$vcf_id)) - -## populate consensus call variants with invervar info -entries_for_cc_in_submission_w_intervar <- inner_join(clinvar_anno_intervar_vcf_df, entries_for_cc_in_submission, by = "vcf_id") %>% - dplyr::rename("Intervar_evidence" = `InterVar: InterVar and Evidence`) - -## remove variants that we found in the submission file that were 1NR for intervar adjustment -clinvar_anno_intervar_vcf_df <- clinvar_anno_intervar_vcf_df %>% ## add column for individual scores that will be re-calculated if we need to adjust using autoPVS1 result - - ## note: ignore PP5 score and BP6 score dplyr::mutate( evidencePVS1 = str_match(`InterVar: InterVar and Evidence`, "PVS1\\=(\\d+)\\s")[, 2], evidenceBA1 = str_match(`InterVar: InterVar and Evidence`, "BA1\\=(\\d+)\\s")[, 2], @@ -259,10 +149,17 @@ clinvar_anno_intervar_vcf_df <- clinvar_anno_intervar_vcf_df %>% evidencePP = map_dbl(str_match(`InterVar: InterVar and Evidence`, "\\sPP\\=\\[([^]]+)\\]")[, 2], function(x) sum(as.integer(unlist(str_split(x, ",")))[-5])), evidenceBS = map_dbl(str_match(`InterVar: InterVar and Evidence`, "\\sBS\\=\\[([^]]+)\\]")[, 2], function(x) sum(as.integer(unlist(str_split(x, ","))))), evidenceBP = map_dbl(str_match(`InterVar: InterVar and Evidence`, "\\sBP\\=\\[([^]]+)\\]")[, 2], function(x) sum(as.integer(unlist(str_split(x, ",")))[-6])) - ) %>% - ## merge dataframe with clinvar_anno_vcf_df above - left_join(vcf_df, by = "vcf_id") %>% - left_join(clinvar_anno_vcf_df[, c("vcf_id", "Stars", "final_call_clinvar")], by = "vcf_id") + ## note: ignore PP5 score and BP6 score + ) + +# combine intervar and multianno by var_id +intervar_multianno_df <- intervar_df %>% + left_join(multianno_df, by = "var_id") + +## combine the vcf, clinvar, intervar and multianno tables by the appropriate vcf id +clinvar_intervar_vcf_df <- vcf_df %>% + left_join(intervar_multianno_df, by = "vcf_id") %>% + left_join(clinvar_df %>% dplyr::select(any_of(c("vcf_id", "VariationID", "ClinicalSignificance", "ReviewStatus", "LastEvaluated", "clinvar_flag", "Origin", "OriginSimple"))), by = "vcf_id") ## autopvs1 results autopvs1_results <- vroom(input_autopvs1_file, col_names = TRUE, show_col_types = FALSE) %>% @@ -271,10 +168,11 @@ autopvs1_results <- vroom(input_autopvs1_file, col_names = TRUE, show_col_types vcf_id = str_remove_all(vcf_id, " "), vcf_id = str_replace_all(vcf_id, "chr", "") ) %>% - dplyr::filter(vcf_id %in% clinvar_anno_intervar_vcf_df$vcf_id) + dplyr::filter(vcf_id %in% clinvar_intervar_vcf_df$vcf_id) +## merge autopvs1_results with vcf data, and filter for those variants that need intervar run combined_tab_with_vcf_intervar <- autopvs1_results %>% - inner_join(clinvar_anno_intervar_vcf_df, by = "vcf_id") %>% + inner_join(clinvar_intervar_vcf_df, by = "vcf_id") %>% ## indicate if recalculated dplyr::mutate(intervar_adjusted = if_else((evidencePVS1 == 0), "No", "Yes")) %>% dplyr::mutate( @@ -390,13 +288,10 @@ combined_tab_with_vcf_intervar <- autopvs1_results %>% ## merge tables together (clinvar and intervar) and write to file -master_tab <- clinvar_anno_intervar_vcf_df %>% +master_tab <- clinvar_intervar_vcf_df %>% full_join(combined_tab_with_vcf_intervar[, grepl("vcf_id|intervar_adjusted|evidence|InterVar:|final_call_intervar", names(combined_tab_with_vcf_intervar))], by = "vcf_id") %>% - left_join(variant_summary_df[, c("vcf_id", "VariationID", "ClinicalSignificance", "ReviewStatus", "LastEvaluated", "clinvar_flag", "Origin", "OriginSimple")], by = "vcf_id") %>% - left_join(autopvs1_results, by = "vcf_id") - - -master_tab <- master_tab %>% + left_join(autopvs1_results, by = "vcf_id") %>% + # Make calls dplyr::mutate( intervar_adjusted = coalesce(intervar_adjusted, "No"), evidencePVS1 = coalesce(evidencePVS1.y, as.double(evidencePVS1.x)), @@ -406,52 +301,17 @@ master_tab <- master_tab %>% evidencePP = coalesce(as.double(evidencePP.y), as.double(evidencePP.x)), evidenceBS = coalesce(as.double(evidenceBS.y), as.double(evidenceBS.x)), evidenceBP = coalesce(as.double(evidenceBP.y), as.double(evidenceBP.x)), - Intervar_evidence = coalesce(`InterVar: InterVar and Evidence.x`, `InterVar: InterVar and Evidence.y`) - ) - -## remove older columns -master_tab <- master_tab %>% dplyr::select(-c( - evidencePVS1.x, evidencePVS1.y, evidenceBA1.x, evidenceBA1.y, evidencePS.x, evidencePS.y, evidencePM.x, evidencePM.y, evidencePP.x, evidencePP.y, evidenceBS.x, evidenceBS.y, evidenceBP.x, evidenceBP.y, - `InterVar: InterVar and Evidence.x`, `InterVar: InterVar and Evidence.y` -)) - -## reformat columns -master_tab <- full_join(master_tab, entries_for_cc_in_submission, by = "vcf_id") %>% - dplyr::mutate(final_call_clinvar = coalesce(final_call_clinvar.y, final_call_clinvar.x)) %>% - dplyr::mutate(final_call = if_else(Stars.x == "0" | is.na(Stars.x), final_call_intervar, final_call_clinvar)) %>% - full_join(entries_for_cc_in_submission_w_intervar[c("vcf_id", "Intervar_evidence")], by = "vcf_id") %>% - dplyr::mutate( - Intervar_evidence = coalesce(Intervar_evidence.y, Intervar_evidence.x), - ClinVar_ClinicalSignificance = coalesce(ClinicalSignificance.x, ClinicalSignificance.y), - Stars = case_when( - Stars.x == "1NR" ~ "1", - TRUE ~ Stars.x - ) + Intervar_evidence = coalesce(`InterVar: InterVar and Evidence.x`, `InterVar: InterVar and Evidence.y`), ) %>% - # Only retain ClinSig and ReviewStatus for variants with ClinVar annotation in VCF, and exclude for variants only found in submission summary file + dplyr::select(-c( + evidencePVS1.x, evidencePVS1.y, evidenceBA1.x, evidenceBA1.y, evidencePS.x, evidencePS.y, evidencePM.x, evidencePM.y, evidencePP.x, evidencePP.y, evidenceBS.x, evidenceBS.y, evidenceBP.x, evidenceBP.y, + `InterVar: InterVar and Evidence.x`, `InterVar: InterVar and Evidence.y` + )) %>% dplyr::mutate( - ClinVar_ClinicalSignificance = case_when( - !is.na(Stars) ~ ClinVar_ClinicalSignificance, - TRUE ~ NA_character_ - ), - ReviewStatus = case_when( - !is.na(Stars) ~ ReviewStatus, - TRUE ~ NA_character_ - ), - LastEvaluated = case_when( - !is.na(Stars) ~ LastEvaluated, - TRUE ~ NA_character_ - ) - ) %>% - dplyr::select( - -final_call_clinvar.x, -final_call_clinvar.y, - -Intervar_evidence.x, -Intervar_evidence.y, - -ClinicalSignificance.x, -ClinicalSignificance.y + final_call_clinvar = ClinicalSignificance, + final_call = coalesce(final_call_clinvar, final_call_intervar) ) -## address ambiguous calls (non L/LB/P/LP/VUS) by taking the InterVar final call -master_tab <- address_ambiguous_calls(master_tab) - ## fix spelling and nomenclature inconsistencies master_tab <- master_tab %>% dplyr::mutate( @@ -466,20 +326,22 @@ master_tab <- master_tab %>% ## add column indicating final call source master_tab <- master_tab %>% dplyr::mutate(Reasoning_for_call = case_when( - vcf_id %in% vcf_to_run_intervar ~ "InterVar", + is.na(final_call_clinvar) ~ "InterVar", TRUE ~ "ClinVar" )) %>% # modify `ClinVar_ClinicalSignificance` to equal `final_call` for ClinVar calls dplyr::mutate(ClinVar_ClinicalSignificance = case_when( Reasoning_for_call == "ClinVar" ~ final_call, - TRUE ~ str_replace(ClinVar_ClinicalSignificance, " ", "_") + TRUE ~ str_replace(final_call_clinvar, " ", "_") )) %>% dplyr::mutate(sample_id = sample_name) %>% - dplyr::relocate(any_of(c( - "sample_id", "CHROM", "POS", "START", "ID", "REF", "ALT", - "final_call", "Reasoning_for_call", - "Stars", "ClinVar_ClinicalSignificance", "Intervar_evidence" - ))) + dplyr::relocate( + any_of(c( + "sample_id", "CHROM", "START", "POS", "ID", "REF", "ALT", + "final_call", "Reasoning_for_call", + "Stars", "ClinVar_ClinicalSignificance", "Intervar_evidence" + )) + ) # write out to file master_tab %>% diff --git a/scripts/02-annotate_variants_CAVATICA_input.R b/scripts/02-annotate_variants_CAVATICA_input.R deleted file mode 100644 index 9ad5d69..0000000 --- a/scripts/02-annotate_variants_CAVATICA_input.R +++ /dev/null @@ -1,513 +0,0 @@ -################################################################################ -# 02-annotate_variants_CAVATICA_input.R -# written by Ammar Naqvi & refactored by Saksham Phul -# -# This script annotates variants based on clinVar and integrates a modified -# version of InterVar that involves adjustments of calls based on ACMG-AMP -# guidelines -# -# usage: Rscript 02-annotate_variants_CAVATICA_input.R --vcf -# --intervar -# --autopvs1 -# --clinvar 'clinvar_yyyymmdd.vcf.gz' -# --variant_summary -# --output -################################################################################ - -suppressPackageStartupMessages({ - library("tidyverse") - library("optparse") - library("vroom") -}) - -# Get `magrittr` pipe -`%>%` <- dplyr::`%>%` - -# parse parameters -option_list <- list( - make_option(c("--vcf"), - type = "character", - help = "Input vcf file with VEP annotations" - ), - make_option(c("--intervar"), - type = "character", - help = "input intervar file" - ), - make_option(c("--multianno"), - type = "character", - help = "input multianno file" - ), - make_option(c("--autopvs1"), - type = "character", - help = "input autopvs1 file" - ), - make_option(c("--clinvar"), - type = "character", - help = "specific clinVar file (format: clinvar_20211225.vcf.gz)" - ), ## https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive_2.0/2022clinvar_20211225.vcf.gz - make_option(c("--variant_summary"), - type = "character", - help = "variant_summary file (format: variant_summary_2023-02.txt)" - ), - make_option(c("--output"), - type = "character", default = "out", - help = "output name" - ), - make_option(c("--outdir"), - type = "character", default = "../results", - help = "output directory" - ), - make_option(c("--sample_id"), - type = "character", - help = "input sample bioassay id" - ) -) - -opt <- parse_args(OptionParser(option_list = option_list)) - -## get input files from parameters (read) -input_vcf_file <- opt$vcf -input_intervar_file <- opt$intervar -input_autopvs1_file <- opt$autopvs1 -input_multianno_file <- opt$multianno -input_clinVar_file <- opt$clinvar -input_variant_summary <- opt$variant_summary -summary_level <- opt$summary_level_vcf -output_name <- opt$output -results_dir <- opt$outdir -sample_name <- opt$sample_id - -# create results directory if it does not exist -if (!dir.exists(results_dir)) { - dir.create(results_dir) -} - -## output files -output_tab_abr_file <- paste0(output_name, ".cavatica_input.annotations_report.abridged.tsv") - -## allocate more memory capacity -Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2) - -address_ambiguous_calls <- function(results_tab_abridged) { ## address ambiguous calls (non L/LB/P/LP/VUS) by taking the InterVar final call - - results_tab_abridged <- results_tab_abridged %>% - dplyr::mutate(new_call = case_when( - is.na(final_call) | (final_call != "Pathogenic" & - final_call != "Likely_benign" & final_call != "Likely_pathogenic" & - final_call != "Uncertain_significance" & final_call != "Benign" & - final_call != "Uncertain significance" & final_call != "Likely benign" & - final_call != "Likely pathogenic") ~ str_match(Intervar_evidence, "InterVar\\:\\s(\\w+\\s\\w+)*")[, 2], - TRUE ~ NA_character_ - )) %>% - dplyr::mutate(final_call = case_when( - !is.na(new_call) ~ new_call, - TRUE ~ final_call - )) %>% - dplyr::select(-new_call) - - return(results_tab_abridged) -} - -# Open vcf and read lines until a line without '#' is found -con <- file(input_vcf_file, "r") -skip_lines <- 0 -while (grepl("^#", readLines(con, n = 1))) { - skip_lines <- skip_lines + 1 -} - -## retrieve and store clinVar input file into table -vcf_df <- vroom(input_vcf_file, skip = skip_lines, delim = "\t", col_names = c("CHROM", "START", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"), show_col_types = FALSE) %>% - dplyr::mutate( - vcf_id = str_remove_all(paste(CHROM, "-", START, "-", REF, "-", ALT), " "), - vcf_id = str_replace(vcf_id, "chr", "") - ) - -if (!is.null(input_clinVar_file)) { - ## add clinvar table to this (INFO) - clinvar_anno_vcf_df <- vroom(input_clinVar_file, comment = "#", delim = "\t", col_names = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"), trim_ws = TRUE, show_col_types = FALSE) %>% - # add vcf id column - mutate(vcf_id = str_remove_all(paste(CHROM, "-", POS, "-", REF, "-", ALT), " ")) %>% - semi_join(vcf_df, by = "vcf_id") %>% - dplyr::mutate( - vcf_id = str_replace_all(vcf_id, "chr", ""), - # add star annotations to clinVar results table based on filters // ## default version - Stars = case_when( - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_single_submitter") ~ "1", - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_multiple_submitters") ~ "2", - str_detect(INFO, "CLNREVSTAT\\=reviewed_by_expert_panel") ~ "3", - str_detect(INFO, "CLNREVSTAT\\=practice_guideline") ~ "4", - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_conflicting_interpretations|CLNREVSTAT\\=criteria_provided,_conflicting_classifications") ~ "1NR", - str_detect(INFO, "CLNREVSTAT\\=no_assertion|CLNREVSTAT\\=no_interpretation|CLNREVSTAT\\=no_classification") ~ "0", - str_detect(INFO, "CLNREVSTAT") ~ "Other", - TRUE ~ NA_character_ - ), - ## extract the calls and put in own column - final_call_clinvar = str_match(INFO, "CLNSIG\\=(\\w+)([\\|\\/]\\w+)*\\;")[, 2] - ) -} else { - ## add column "vcf_id" to clinVar results in order to cross-reference with intervar and autopvs1 table - clinvar_anno_vcf_df <- vcf_df %>% - dplyr::mutate( - # add star annotations to clinVar results table based on filters // ## default version - Stars = case_when( - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_single_submitter") ~ "1", - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_multiple_submitters") ~ "2", - str_detect(INFO, "CLNREVSTAT\\=reviewed_by_expert_panel") ~ "3", - str_detect(INFO, "CLNREVSTAT\\=practice_guideline") ~ "4", - str_detect(INFO, "CLNREVSTAT\\=criteria_provided,_conflicting_interpretations|CLNREVSTAT\\=criteria_provided,_conflicting_classifications") ~ "1NR", - str_detect(INFO, "no_assertion|no_interpretation|no_classification") ~ "0", - str_detect(INFO, "CLNREVSTAT") ~ "Other", - TRUE ~ NA_character_ - ), - ## extract the calls and put in own column - final_call_clinvar = str_match(INFO, "CLNSIG\\=(\\w+)([\\|\\/]\\w+)*\\;")[, 2] - ) -} - -if (sum(grepl("Other", clinvar_anno_vcf_df$Stars)) > 0) { - print("ERROR: there are ClinVar review statuses in data that are not recognized by AutoGVP as aligning with star values.\nPlease check that ClinVar review status values match those presented here: https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/.\nIf no discrepancies exist, please submit an issue through the AutoGVP github here: https://github.com/diskin-lab-chop/AutoGVP/issues") -} - -## store variants without clinvar info -clinvar_anti_join_vcf_df <- anti_join(vcf_df, clinvar_anno_vcf_df, by = "vcf_id") %>% - dplyr::mutate( - vcf_id = str_replace_all(vcf_id, "chr", ""), - CHROM = str_replace_all(CHROM, "chr", "") - ) %>% - dplyr::rename(rs_id = ID) - -## get latest calls from variant and submission summary files -variant_summary_df <- vroom(input_variant_summary, show_col_types = FALSE) %>% - filter(vcf_id %in% vcf_df$vcf_id) %>% - dplyr::select(-GeneSymbol) - -## filter only those variants that need consensus call and find call in submission table -entries_for_cc <- filter(clinvar_anno_vcf_df, Stars == "1NR", final_call_clinvar != "Benign", final_call_clinvar != "Pathogenic", final_call_clinvar != "Likely_benign", final_call_clinvar != "Likely_pathogenic", final_call_clinvar != "Uncertain_significance") - -entries_for_cc_in_submission <- inner_join(variant_summary_df, entries_for_cc, by = "vcf_id") %>% - dplyr::mutate(final_call_clinvar = ClinicalSignificance) %>% - dplyr::select(vcf_id, ClinicalSignificance, final_call_clinvar, Stars) - -## one Star cases that are “criteria_provided,_single_submitter” that do NOT have the B, LB, P, LP, VUS call must also go to intervar -## modified: any cases that do NOT have the B, LB, P, LP, VUS call must also go to intervar -additional_intervar_cases <- filter(clinvar_anno_vcf_df, final_call_clinvar != "Benign", final_call_clinvar != "Pathogenic", final_call_clinvar != "Likely_benign", final_call_clinvar != "Likely_pathogenic", final_call_clinvar != "Uncertain_significance") %>% - anti_join(entries_for_cc_in_submission, by = "vcf_id") %>% - anti_join(clinvar_anti_join_vcf_df, by = "vcf_id") - -if (nrow(clinvar_anti_join_vcf_df) > 0) { - clinvar_anti_join_vcf_df <- clinvar_anti_join_vcf_df %>% - mutate( - QUAL = as.character(QUAL), - # POS = as.double(POS) - ) -} - -## filter only those variant entries that need an InterVar run (No Star) and add the additional intervar cases from above -entries_for_intervar <- filter(clinvar_anno_vcf_df, Stars == "0" | is.na(Stars), na.rm = TRUE) %>% - bind_rows((additional_intervar_cases)) %>% - bind_rows(clinvar_anti_join_vcf_df) %>% - distinct() - -vcf_to_run_intervar <- entries_for_intervar$vcf_id - -## get multianno file to add correct vcf_id in intervar table -multianno_df <- vroom(input_multianno_file, delim = "\t", trim_ws = TRUE, col_names = TRUE, show_col_types = FALSE) %>% - dplyr::select( - -End, - -contains(c( - "AF", - "gnomad", "CLN", - "score", "pred", "CADD", "Eigen", - "100way", "30way", "GTEx" - )) - ) %>% - mutate( - vcf_id = str_remove_all(paste(Chr, "-", Otherinfo5, "-", Otherinfo7, "-", Otherinfo8), " "), - vcf_id = str_replace(vcf_id, "chr", ""), - var_id = str_remove_all(paste(Chr, "-", Start, "-", Ref, "-", Alt), " "), - var_id = str_replace(var_id, "chr", "") - ) %>% - # remove coordiante, Otherinfo, gnomad, and clinVar-related columns - dplyr::select( - -Chr, -Ref, -Alt, - -contains(("Otherinfo")) - ) - - -## add intervar table -clinvar_anno_intervar_vcf_df <- vroom(input_intervar_file, delim = "\t", trim_ws = TRUE, col_names = TRUE, show_col_types = FALSE) %>% - dplyr::select( - -`clinvar: Clinvar`, - -contains(c("gnomad", "CADD", "Freq", "SCORE", "score", "ORPHA", "MIM", "rmsk", "GERP", "phylo")) - ) %>% - dplyr::mutate(var_id = paste0(`#Chr`, "-", Start, "-", Ref, "-", Alt)) %>% - distinct(var_id, .keep_all = T) %>% - # remove coordiante, Otherinfo, gnomad, and clinVar-related columns - dplyr::select( - -`#Chr`, -Start, -End, -Alt, -Ref - ) - - -## combine the intervar and multianno tables by the appropriate vcf id -clinvar_anno_intervar_vcf_df <- clinvar_anno_intervar_vcf_df %>% - dplyr::select(any_of(c("Ref.Gene", "InterVar: InterVar and Evidence", "var_id"))) %>% - left_join(multianno_df, by = "var_id") %>% - filter(vcf_id %in% c(clinvar_anno_vcf_df$vcf_id, entries_for_intervar$vcf_id)) - -## populate consensus call variants with intervar info -entries_for_cc_in_submission_w_intervar <- inner_join(clinvar_anno_intervar_vcf_df, entries_for_cc_in_submission, by = "vcf_id") %>% - dplyr::rename("Intervar_evidence" = `InterVar: InterVar and Evidence`) - - -clinvar_anno_intervar_vcf_df <- clinvar_anno_intervar_vcf_df %>% - ## add column for individual scores that will be re-calculated if we need to adjust using autoPVS1 result - - ## note: ignore PP5 score and BP6 score - dplyr::mutate( - evidencePVS1 = str_match(`InterVar: InterVar and Evidence`, "PVS1\\=(\\d+)\\s")[, 2], - evidenceBA1 = str_match(`InterVar: InterVar and Evidence`, "BA1\\=(\\d+)\\s")[, 2], - evidencePS = map_dbl(str_match(`InterVar: InterVar and Evidence`, "\\sPS\\=\\[([^]]+)\\]")[, 2], function(x) sum(as.integer(unlist(str_split(x, ","))))), - evidencePM = map_dbl(str_match(`InterVar: InterVar and Evidence`, "\\sPM\\=\\[([^]]+)\\]")[, 2], function(x) sum(as.integer(unlist(str_split(x, ","))))), - evidencePP = map_dbl(str_match(`InterVar: InterVar and Evidence`, "\\sPP\\=\\[([^]]+)\\]")[, 2], function(x) sum(as.integer(unlist(str_split(x, ",")))[-5])), - evidenceBS = map_dbl(str_match(`InterVar: InterVar and Evidence`, "\\sBS\\=\\[([^]]+)\\]")[, 2], function(x) sum(as.integer(unlist(str_split(x, ","))))), - evidenceBP = map_dbl(str_match(`InterVar: InterVar and Evidence`, "\\sBP\\=\\[([^]]+)\\]")[, 2], function(x) sum(as.integer(unlist(str_split(x, ",")))[-6])) - ) %>% - ## merge dataframe with clinvar_anno_vcf_df above - left_join(vcf_df, by = "vcf_id") %>% - left_join(clinvar_anno_vcf_df %>% select(-one_of(c("CHROM", "START", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "FORMAT", "Sample", "INFO"))), by = "vcf_id") - - -## autopvs1 results -autopvs1_results <- vroom(input_autopvs1_file, col_names = TRUE, show_col_types = FALSE) %>% - dplyr::select(vcf_id, Feature, criterion) %>% - mutate( - vcf_id = str_remove_all(vcf_id, " "), - vcf_id = str_replace_all(vcf_id, "chr", "") - ) %>% - dplyr::filter(vcf_id %in% clinvar_anno_intervar_vcf_df$vcf_id) - - -## merge autopvs1_results with vcf data, and filter for those variants that need intervar run -combined_tab_with_vcf_intervar <- autopvs1_results %>% - inner_join(clinvar_anno_intervar_vcf_df, by = "vcf_id") %>% - ## indicate if recalculated - dplyr::mutate(intervar_adjusted = if_else((evidencePVS1 == 0), "No", "Yes")) %>% - dplyr::mutate( - ## criteria to check intervar/autopvs1 to re-calculate and create a score column that will inform the new re-calculated final call - # if criterion is NF1|SS1|DEL1|DEL2|DUP1|IC1 then PVS1=1 - evidencePVS1 = if_else((criterion == "NF1" | criterion == "SS1" | - criterion == "DEL1" | criterion == "DEL2" | - criterion == "DUP1" | criterion == "IC1") & evidencePVS1 == 1, "1", evidencePVS1), - - # if criterion is NF3|NF5|SS3|SS5|SS8|SS10|DEL4|DEL8|DEL6|DEL10|DUP3|IC2 then PVS1 = 0; PS = PS+1 - evidencePS = if_else((criterion == "NF3" | criterion == "NF5" | - criterion == "SS3" | criterion == "SS5" | - criterion == "SS8" | criterion == "SS10" | criterion == "DEL4" | - criterion == "DEL8" | criterion == "DEL6" | - criterion == "DEL10" | criterion == "DUP3" | - criterion == "IC2") & evidencePVS1 == 1, as.numeric(evidencePS) + 1, as.double(evidencePS)), - evidencePVS1 = if_else((criterion == "NF3" | criterion == "NF5" | - criterion == "SS3" | criterion == "SS5" | - criterion == "SS8" | criterion == "SS10" | criterion == "DEL4" | - criterion == "DEL8" | criterion == "DEL6" | - criterion == "DEL10" | criterion == "DUP3" | - criterion == "IC2") & evidencePVS1 == 1, "0", evidencePVS1), - - # if criterion is NF6|SS6|SS9|DEL7|DEL11|IC3 then PVS1 = 0; PM = PM+1; - evidencePM = if_else((criterion == "NF6" | criterion == "SS6" | - criterion == "SS9" | criterion == "DEL7" | - criterion == "DEL11" | criterion == "IC3") & evidencePVS1 == 1, as.numeric(evidencePM) + 1, as.double(evidencePM)), - evidencePVS1 = if_else((criterion == "NF6" | criterion == "SS6" | - criterion == "SS9" | criterion == "DEL7" | - criterion == "DEL11" | criterion == "IC3") & evidencePVS1 == 1, "0", evidencePVS1), - - # if criterion is IC4 then PVS1 = 0; PP = PP+1; - evidencePP = if_else((criterion == "IC4") & evidencePVS1 == 1, as.numeric(evidencePP) + 1, as.double(evidencePP)), - evidencePVS1 = if_else((criterion == "IC4") & evidencePVS1 == 1, "0", evidencePVS1), - - # if criterion is na|NF0|NF2|NF4|SS2|SS4|SS7|DEL3|DEL5|DEL9|DUP2|DUP4|DUP5|IC5 then PVS1 = 0; - evidencePVS1 = if_else((criterion == "na" | criterion == "NF0" | criterion == "NF2" | criterion == "NF4" | - criterion == "SS2" | criterion == "SS4" | criterion == "SS7" | - criterion == "DEL3" | criterion == "DEL5" | criterion == "DEL9" | - criterion == "DUP2" | criterion == "DUP4" | criterion == "DUP5" | - criterion == "IC5") & evidencePVS1 == 1, 0, as.double(evidencePVS1)), - - ## adjust variables based on given rules described in README - final_call_intervar = ifelse(intervar_adjusted == "No", - sub(".*InterVar: ", "", sub("\\ P.*", "", `InterVar: InterVar and Evidence`)), - ifelse((evidencePVS1 == 1) & (evidencePVS1 == 1 & - ((evidencePS >= 1) | - (evidencePM >= 2) | - (evidencePM == 1 & evidencePP == 1) | - (evidencePP >= 2)) & - ((evidenceBA1) == 1 | - (evidenceBS >= 2) | - (evidenceBP >= 2) | - (evidenceBS >= 1 & evidenceBP >= 1) | - (evidenceBA1 == 1 & (evidenceBS >= 1 | evidenceBP >= 1)))), "Uncertain_significance", - ifelse(((evidencePVS1 == 1) & (evidencePS >= 2) & - ((evidenceBA1) == 1 | - (evidenceBS >= 2) | - (evidenceBP >= 2) | - (evidenceBS >= 1 & evidenceBP >= 1) | - (evidenceBA1 == 1 & (evidenceBS >= 1 | evidenceBP >= 1)))), "Uncertain_significance", - ifelse(((evidencePVS1 == 1) & (evidencePS == 1 & - (evidencePM >= 3 | - (evidencePM == 2 & evidencePP >= 2) | - (evidencePM == 1 & evidencePP >= 4))) & - ((evidenceBA1) == 1 | - (evidenceBS >= 2) | - (evidenceBP >= 2) | - (evidenceBS >= 1 & evidenceBP >= 1) | - (evidenceBA1 == 1 & (evidenceBS >= 1 | evidenceBP >= 1)))), "Uncertain_significance", - ifelse((((evidencePVS1 == 1) & (evidencePVS1 == 1 & evidencePM == 1) | - (evidencePS == 1 & evidencePM >= 1) | - (evidencePS == 1 & evidencePP >= 2) | - (evidencePM >= 3) | - (evidencePM == 2 & evidencePP >= 2) | - (evidencePM == 1 & evidencePP >= 4)) & - ((evidenceBA1) == 1 | - (evidenceBS >= 2) | - (evidenceBP >= 2) | - (evidenceBS >= 1 & evidenceBP >= 1) | - (evidenceBA1 == 1 & (evidenceBS >= 1 | evidenceBP >= 1)))), "Uncertain_significance", - ifelse((evidencePVS1 == 1) & (evidencePVS1 == 1 & - ((evidencePS >= 1) | - (evidencePM >= 2) | - (evidencePM == 1 & evidencePP == 1) | - (evidencePP >= 2))), "Pathogenic", - ifelse((evidencePVS1 == 1) & (evidencePS >= 2), "Pathogenic", - ifelse((evidencePVS1 == 0) & (evidencePS == 1 & - (evidencePM >= 3 | - (evidencePM == 2 & evidencePP >= 2) | - (evidencePM == 1 & evidencePP >= 4))), "Pathogenic", - ifelse((evidencePVS1 == 1) & (evidencePVS1 == 1 & evidencePM == 1) | - (evidencePS == 1 & evidencePM >= 1) | - (evidencePS == 1 & evidencePP >= 2) | - (evidencePM >= 3) | - (evidencePM == 2 & evidencePP >= 2) | - (evidencePM == 1 & evidencePP >= 4), "Likely_pathogenic", - ifelse((evidencePVS1 == 1) & (evidenceBA1 == 1) | - (evidenceBS >= 2), "Benign", - ifelse((evidencePVS1 == 1) & (evidenceBS == 1 & evidenceBP == 1) | - (evidenceBP >= 2), "Likely_benign", "Uncertain_significance") - ) - ) - ) - ) - ) - ) - ) - ) - ) - ) - ) - - -## merge tables together (clinvar and intervar) and write to file -master_tab <- clinvar_anno_intervar_vcf_df %>% - full_join(combined_tab_with_vcf_intervar[, grepl("vcf_id|intervar_adjusted|evidence|InterVar:|final_call_intervar", names(combined_tab_with_vcf_intervar))], by = "vcf_id") %>% - left_join(variant_summary_df[, c("vcf_id", "VariationID", "ClinicalSignificance", "ReviewStatus", "LastEvaluated", "clinvar_flag", "Origin", "OriginSimple")], by = "vcf_id") %>% - left_join(autopvs1_results, by = "vcf_id") - - -master_tab <- master_tab %>% - dplyr::mutate( - intervar_adjusted = coalesce(intervar_adjusted, "No"), - evidencePVS1 = coalesce(evidencePVS1.y, as.double(evidencePVS1.x)), - evidenceBA1 = coalesce(as.double(evidenceBA1.y), as.double(evidenceBA1.x)), - evidencePS = coalesce(as.double(evidencePS.y), as.double(evidencePS.x)), - evidencePM = coalesce(as.double(evidencePM.y), as.double(evidencePM.x)), - evidencePP = coalesce(as.double(evidencePP.y), as.double(evidencePP.x)), - evidenceBS = coalesce(as.double(evidenceBS.y), as.double(evidenceBS.x)), - evidenceBP = coalesce(as.double(evidenceBP.y), as.double(evidenceBP.x)), - Intervar_evidence = coalesce(`InterVar: InterVar and Evidence.x`, `InterVar: InterVar and Evidence.y`), - ) - -## remove older columns -master_tab <- master_tab %>% dplyr::select(-c( - evidencePVS1.x, evidencePVS1.y, evidenceBA1.x, evidenceBA1.y, evidencePS.x, evidencePS.y, evidencePM.x, evidencePM.y, evidencePP.x, evidencePP.y, evidenceBS.x, evidenceBS.y, evidenceBP.x, evidenceBP.y, - `InterVar: InterVar and Evidence.x`, `InterVar: InterVar and Evidence.y` -)) - -## reformat columns -master_tab <- full_join(master_tab, entries_for_cc_in_submission, by = "vcf_id") %>% - dplyr::mutate(final_call_clinvar = coalesce(final_call_clinvar.y, final_call_clinvar.x)) %>% - dplyr::mutate(final_call = if_else(Stars.x == "0" | is.na(Stars.x), final_call_intervar, final_call_clinvar)) %>% - full_join(entries_for_cc_in_submission_w_intervar[c("vcf_id", "Intervar_evidence")], by = "vcf_id") %>% - dplyr::mutate( - Intervar_evidence = coalesce(Intervar_evidence.y, Intervar_evidence.x), - ClinVar_ClinicalSignificance = coalesce(ClinicalSignificance.x, ClinicalSignificance.y), - Stars = case_when( - Stars.x == "1NR" ~ "1", - TRUE ~ Stars.x - ) - ) %>% - # Only retain ClinSig and ReviewStatus for variants with ClinVar annotation in VCF, and exclude for variants only found in submission summary file - dplyr::mutate( - ClinVar_ClinicalSignificance = case_when( - !is.na(Stars) ~ ClinVar_ClinicalSignificance, - TRUE ~ NA_character_ - ), - ReviewStatus = case_when( - !is.na(Stars) ~ ReviewStatus, - TRUE ~ NA_character_ - ), - LastEvaluated = case_when( - !is.na(Stars) ~ LastEvaluated, - TRUE ~ NA_character_ - ) - ) %>% - dplyr::select( - -any_of( - c( - "final_call_clinvar.x", "final_call_clinvar.y", - "Stars.x", "Stars.y", - "Intervar_evidence.x", "Intervar_evidence.y", - "INFO", "ClinicalSignificance.x", "ClinicalSignificance.y" - ) - ) - ) - -## address ambiguous calls (non L/LB/P/LP/VUS) by taking the InterVar final call -master_tab <- address_ambiguous_calls(master_tab) - -## fix spelling and nomenclature inconsistencies -master_tab <- master_tab %>% - dplyr::mutate( - final_call = replace(final_call, final_call == "Likely benign", "Likely_benign"), - final_call = replace(final_call, final_call == "Uncertain significance", "Uncertain_significance"), - final_call = replace(final_call, final_call == "Benign PVS1", "Benign"), - final_call = replace(final_call, final_call == "Pathogenic PVS1", "Pathogenic"), - final_call = replace(final_call, final_call == "Likely pathogenic", "Likely_pathogenic") - ) %>% - distinct() - -## add column indicating final call source -master_tab <- master_tab %>% - dplyr::mutate(Reasoning_for_call = case_when( - vcf_id %in% vcf_to_run_intervar ~ "InterVar", - TRUE ~ "ClinVar" - )) %>% - # modify `ClinVar_ClinicalSignificance` to equal `final_call` for ClinVar calls - dplyr::mutate(ClinVar_ClinicalSignificance = case_when( - Reasoning_for_call == "ClinVar" ~ final_call, - TRUE ~ str_replace(ClinVar_ClinicalSignificance, " ", "_") - )) %>% - dplyr::mutate(sample_id = sample_name) %>% - dplyr::relocate( - any_of(c( - "sample_id", "CHROM", "START", "POS", "ID", "REF", "ALT", - "final_call", "Reasoning_for_call", - "Stars", "ClinVar_ClinicalSignificance", "Intervar_evidence" - )) - ) - -# write out to file -master_tab %>% - write_tsv( - file.path(results_dir, output_tab_abr_file), - append = FALSE, - quote = "none", - col_names = TRUE - ) diff --git a/scripts/select-clinVar-submissions.R b/scripts/resolve-clinvar-intepretations.R similarity index 70% rename from scripts/select-clinVar-submissions.R rename to scripts/resolve-clinvar-intepretations.R index 49d740f..011c395 100644 --- a/scripts/select-clinVar-submissions.R +++ b/scripts/resolve-clinvar-intepretations.R @@ -1,13 +1,13 @@ ################################################################################ -# select-clinVar-submissions.R +# resolve-clinvar-intepretations.R # written by Ryan Corbett, Patricia Sullivan # # This script selects unique ClinVar variant submission calls based on a list of # predetermined criteria, to be used in AutoGVP for ClinVar variants that need # resolving due to conflicting calls # -# usage: select-clinVar-submissions.R --variant_summary -# --submission_summary +# usage: resolve-clinvar-intepretations.R --variant_summary +# --submission_summary # # NOTE: this script must be run BEFORE running run_autogvp.sh ################################################################################ @@ -57,12 +57,18 @@ results_dir <- opt$outdir conceptID_file <- opt$conceptID_list conflict_res <- opt$conflict_res +# input_submission_file <- file.path("../data/submission_summary_20260104.txt.gz") +# input_variant_summary <- file.path("../data/variant_summary_20260104.txt.gz") +# results_dir <- file.path("../refs/") +# conceptID_file <- file.path("../refs/clinvar_cancer_concept_ids_20260130.txt") +# conflict_res <- "latest" + ## load variant summary file, which reports latest ClinVar consensus calls for each variant variant_summary_df <- vroom(input_variant_summary, - delim = "\t", - col_types = c(ReferenceAlleleVCF = "c", AlternateAlleleVCF = "c", PositionVCF = "i", VariationID = "n"), - show_col_types = FALSE - ) %>% + delim = "\t", + col_types = c(ReferenceAlleleVCF = "c", AlternateAlleleVCF = "c", PositionVCF = "i", VariationID = "n"), + show_col_types = FALSE +) %>% dplyr::rename( AlleleID = dplyr::any_of(c("AlleleID", "#AlleleID")) ) %>% @@ -84,11 +90,31 @@ variant_summary_df <- vroom(input_variant_summary, dplyr::filter(!ReviewStatus %in% c( "no assertion provided", "no assertion criteria provided", - "no classification for the individual variant", + "no classification for the individual variant", "no classification provided", "no classification for the single variant", "no classifications from unflagged records" - )) + )) %>% + dplyr::mutate( + ClinSig_resolved = case_when( + grepl("Conflicting classifications of pathogenicity", ClinicalSignificance) ~ "Conflicting classifications of pathogenicity", + grepl("Uncertain significance", ClinicalSignificance) ~ "Uncertain significance", + grepl("Pathogenic/Likely pathogenic", ClinicalSignificance) ~ "Pathogenic/Likely pathogenic", + grepl("Benign/Likely benign", ClinicalSignificance) ~ "Benign/Likely benign", + grepl("Likely pathogenic", ClinicalSignificance) ~ "Likely pathogenic", + grepl("Likely benign", ClinicalSignificance) ~ "Likely benign", + grepl("Pathogenic", ClinicalSignificance) ~ "Pathogenic", + grepl("Benign", ClinicalSignificance) ~ "Benign", + TRUE ~ NA + ), + Stars = case_when( + ReviewStatus == "practice guideline" ~ 4, + ReviewStatus == "reviewed by expert panel" ~ 3, + ReviewStatus == "criteria provided, multiple submitters, no conflicts" ~ 2, + ReviewStatus %in% c("criteria provided, conflicting classifications", "criteria provided, single submitter") ~ 1, + TRUE ~ NA + ) + ) # Load ClinVar submission summary file, which reports all submissions for each ClinVar variant @@ -102,10 +128,10 @@ while (grepl("^#", readLines(con, n = 1))) { # Load submission file while skipping number of lines determined above submission_summary_df <- vroom(input_submission_file, - skip = skip_lines, - delim = "\t", - show_col_types = F - ) %>% + skip = skip_lines, + delim = "\t", + show_col_types = F +) %>% dplyr::rename( VariationID = dplyr::any_of(c("VariationID", "#VariationID")) ) @@ -155,16 +181,16 @@ submission_summary_df <- submission_summary_df %>% ) %>% dplyr::filter( !ReviewStatus %in% c( - "no assertion provided", + "no assertion provided", "no assertion criteria provided", - "no classification provided", + "no classification provided", "flagged submission" ), ClinicalSignificance %in% c( - "Pathogenic", - "Likely pathogenic", - "Benign", - "Likely benign", + "Pathogenic", + "Likely pathogenic", + "Benign", + "Likely benign", "Uncertain significance" ), # Filter on contributing records if this column is present in supplied file @@ -178,41 +204,22 @@ submission_summary_df <- submission_summary_df %>% # merge submission_summary and variant_summary info submission_merged_df <- submission_summary_df %>% dplyr::rename("LastEvaluated" = DateLastEvaluated) %>% - left_join(variant_summary_df, + inner_join(variant_summary_df, by = "VariationID", - multiple = "all", + multiple = "all", suffix = c("_sub", "_var"), relationship = "many-to-many" ) %>% - dplyr::mutate(LastEvaluated = coalesce(LastEvaluated_sub, LastEvaluated_var)) %>% - dplyr::filter(!is.na(vcf_id)) - -# Extract submissions that match variant consensus call and are 2+ stars -variants_no_conflict_expert <- submission_merged_df %>% - filter(ReviewStatus_var %in% c( - "practice guideline", # 4 stars - "reviewed by expert panel", # 3 stars - "criteria provided, multiple submitters, no conflicts" # 2 stars - )) %>% + dplyr::mutate(LastEvaluated = coalesce(LastEvaluated_sub, LastEvaluated_var)) + +# Extract submissions that have no conflicts +variants_resolved <- submission_merged_df %>% + filter(ClinSig_resolved != "Conflicting classifications of pathogenicity") %>% dplyr::arrange(desc(mdy(LastEvaluated))) %>% distinct(VariationID, .keep_all = T) %>% + mutate(ClinSig_report = ClinSig_resolved) %>% arrange(VariationID) -# Identify VariationIDs with no ClinSig conflicts between variant and submission summary at date last evaluated -variants_no_conflicts <- submission_merged_df %>% - dplyr::filter(!VariationID %in% variants_no_conflict_expert$VariationID) %>% - dplyr::filter( - ClinicalSignificance_sub == ClinicalSignificance_var | - is.na(ClinicalSignificance_var) | - (grepl("Pathogenic|Likely pathogenic", ClinicalSignificance_sub) & grepl("Pathogenic|Likely pathogenic", ClinicalSignificance_var)) | - (grepl("Benign|Likely benign", ClinicalSignificance_sub) & grepl("Benign|Likely benign", ClinicalSignificance_var)) | - (grepl("Uncertain significance", ClinicalSignificance_sub) & grepl("Uncertain significance", ClinicalSignificance_var)) - ) %>% - dplyr::arrange(desc(mdy(LastEvaluated))) %>% - distinct(VariationID, .keep_all = T) %>% - arrange(VariationID) %>% - # append variants with >= 2 stars - bind_rows(variants_no_conflict_expert) # IF list of concept IDs provided -- filter remaining submissions to only those associated with concept IDs, and resolve conflicts by consensus, latest date, or severity if (!is.null(conceptID_file)) { @@ -221,11 +228,13 @@ if (!is.null(conceptID_file)) { # Filter variants for those associated with concept IDs variants_with_conceptIDs <- submission_merged_df %>% - dplyr::filter(!VariationID %in% variants_no_conflicts$VariationID) %>% + dplyr::filter(ClinSig_resolved == "Conflicting classifications of pathogenicity") %>% dplyr::mutate(conceptID = unlist(lapply(strsplit(ReportedPhenotypeInfo, ":"), function(x) x[[1]]))) %>% dplyr::filter(conceptID %in% conceptIDs) %>% dplyr::select(-conceptID) + length(unique(variants_with_conceptIDs$VariationID)) + # if no. submissions remaining = 1, add to no conflict variants variants_no_conflicts_conceptID <- variants_with_conceptIDs %>% count(VariationID) %>% @@ -233,18 +242,22 @@ if (!is.null(conceptID_file)) { pull(VariationID) # add variants with one submission to variants_no_conflicts - variants_no_conflicts <- variants_with_conceptIDs %>% + variants_resolved <- variants_with_conceptIDs %>% filter(VariationID %in% variants_no_conflicts_conceptID) %>% - dplyr::mutate(ReviewStatus_var = glue::glue("{ReviewStatus_var}, single submission associated with conceptIDs")) %>% - bind_rows(variants_no_conflicts) + dplyr::mutate( + ClinSig_report = ClinicalSignificance_sub, + ReviewStatus_var = glue::glue("{ReviewStatus_var}, single submission associated with conceptIDs") + ) %>% + bind_rows(variants_resolved) # Identify variants with consensus calls consensus_calls_conceptIDs <- variants_with_conceptIDs %>% + filter(!VariationID %in% variants_no_conflicts_conceptID) %>% # Here, we will group B+LB and P+LP together to identify majority calls dplyr::mutate(ClinSig = case_when( - ClinicalSignificance_sub %in% c("Pathogenic", "Likely pathogenic") ~ "P/LP", - ClinicalSignificance_sub %in% c("Benign", "Likely benign") ~ "B/LB", - ClinicalSignificance_sub == "Uncertain significance" ~ "VUS", + ClinicalSignificance_sub %in% c("Pathogenic", "Likely pathogenic") ~ "Pathogenic/Likely pathogenic", + ClinicalSignificance_sub %in% c("Benign", "Likely benign") ~ "Benign/Likely benign", + ClinicalSignificance_sub == "Uncertain significance" ~ "Uncertain significance", TRUE ~ NA_character_ )) %>% count(VariationID, ClinSig) %>% @@ -253,11 +266,11 @@ if (!is.null(conceptID_file)) { dplyr::filter(!VariationID %in% VariationID[duplicated(VariationID)]) # extract variants resolved through consensus calling - variants_consensus_call_conceptIDs <- variants_with_conceptIDs %>% + variants_resolved <- variants_with_conceptIDs %>% dplyr::mutate(ClinSig = case_when( - ClinicalSignificance_sub %in% c("Pathogenic", "Likely pathogenic") ~ "P/LP", - ClinicalSignificance_sub %in% c("Benign", "Likely benign") ~ "B/LB", - ClinicalSignificance_sub == "Uncertain significance" ~ "VUS", + ClinicalSignificance_sub %in% c("Pathogenic", "Likely pathogenic") ~ "Pathogenic/Likely pathogenic", + ClinicalSignificance_sub %in% c("Benign", "Likely benign") ~ "Benign/Likely benign", + ClinicalSignificance_sub == "Uncertain significance" ~ "Uncertain significance", TRUE ~ NA_character_ )) %>% dplyr::filter(glue::glue("{VariationID}-{ClinSig}") %in% glue::glue("{consensus_calls_conceptIDs$VariationID}-{consensus_calls_conceptIDs$ClinSig}")) %>% @@ -265,52 +278,57 @@ if (!is.null(conceptID_file)) { distinct(VariationID, .keep_all = T) %>% arrange(VariationID) %>% dplyr::mutate(ReviewStatus_var = glue::glue("{ReviewStatus_var}, submissions associated with conceptIDs, consensus call taken")) %>% - dplyr::select(-ClinSig) + dplyr::rename(ClinSig_report = ClinSig) %>% + bind_rows(variants_resolved) # Resolve remaining conflicts for variants associated with concept IDs, either by taking date last evaluated or most severe call if (conflict_res == "latest") { - variants_resolved_conceptIDs <- variants_with_conceptIDs %>% - dplyr::filter(!VariationID %in% variants_consensus_call_conceptIDs$VariationID) %>% + variants_resolved <- variants_with_conceptIDs %>% + dplyr::filter( + !VariationID %in% consensus_calls_conceptIDs, + !VariationID %in% variants_no_conflicts_conceptID + ) %>% dplyr::arrange(desc(mdy(LastEvaluated_sub))) %>% distinct(VariationID, .keep_all = T) %>% - dplyr::mutate(ReviewStatus_var = glue::glue("{ReviewStatus_var}, submissions associated with conceptIDs, latest date evaluated taken")) + dplyr::mutate( + ClinSig_report = ClinicalSignificance_sub, + ReviewStatus_var = glue::glue("{ReviewStatus_var}, submissions associated with conceptIDs, latest date evaluated taken") + ) %>% + bind_rows(variants_resolved) } if (conflict_res == "most_severe") { - variants_resolved_conceptIDs <- variants_with_conceptIDs %>% + variants_resolved <- variants_with_conceptIDs %>% dplyr::filter(!VariationID %in% variants_consensus_call_conceptIDs$VariationID) %>% dplyr::mutate(ClinicalSignificance_sub = fct_relevel( ClinicalSignificance_sub, c( "Pathogenic", "Likely pathogenic", - "Uncertain significance", "Likely benign", - "Benign" + "Uncertain significance", + "Likely benign", "Benign" ) )) %>% dplyr::arrange(VariationID, ClinicalSignificance_sub, desc(mdy(LastEvaluated_sub))) %>% distinct(VariationID, .keep_all = T) %>% - dplyr::mutate(ReviewStatus_var = glue::glue("{ReviewStatus_var}, submissions associated with conceptIDs, most severe call at latest date taken")) + dplyr::mutate( + ClinSig_report = ClinicalSignificance_sub, + ReviewStatus_var = glue::glue("{ReviewStatus_var}, submissions associated with conceptIDs, most severe call at latest date taken") + ) %>% + bind_rows(variants_resolved) } - - # add other resolved submissions to variants_no_conflicts - variants_no_conflicts <- variants_no_conflicts %>% - bind_rows( - variants_consensus_call_conceptIDs, - variants_resolved_conceptIDs - ) } # Extract remaining variants with conflicts conflicting_variants <- submission_merged_df %>% - dplyr::filter(!VariationID %in% variants_no_conflicts$VariationID) + dplyr::filter(!VariationID %in% variants_resolved$VariationID) # Identify cases where a majority pathogenicity call has been made for variants consensus_calls <- conflicting_variants %>% # Here, we will group B+LB and P+LP together to identify majority calls dplyr::mutate(ClinSig = case_when( - ClinicalSignificance_sub %in% c("Pathogenic", "Likely pathogenic") ~ "P/LP", - ClinicalSignificance_sub %in% c("Benign", "Likely benign") ~ "B/LB", - ClinicalSignificance_sub == "Uncertain significance" ~ "VUS", + ClinicalSignificance_sub %in% c("Pathogenic", "Likely pathogenic") ~ "Pathogenic/Likely pathogenic", + ClinicalSignificance_sub %in% c("Benign", "Likely benign") ~ "Benign/Likely benign", + ClinicalSignificance_sub == "Uncertain significance" ~ "Uncertain significance", TRUE ~ NA_character_ )) %>% count(VariationID, ClinSig) %>% @@ -321,9 +339,9 @@ consensus_calls <- conflicting_variants %>% # Extract variants with consensus calls variants_consensus_call <- conflicting_variants %>% dplyr::mutate(ClinSig = case_when( - ClinicalSignificance_sub %in% c("Pathogenic", "Likely pathogenic") ~ "P/LP", - ClinicalSignificance_sub %in% c("Benign", "Likely benign") ~ "B/LB", - ClinicalSignificance_sub == "Uncertain significance" ~ "VUS", + ClinicalSignificance_sub %in% c("Pathogenic", "Likely pathogenic") ~ "Pathogenic/Likely pathogenic", + ClinicalSignificance_sub %in% c("Benign", "Likely benign") ~ "Benign/Likely benign", + ClinicalSignificance_sub == "Uncertain significance" ~ "Uncertain significance", TRUE ~ NA_character_ )) %>% dplyr::filter(glue::glue("{VariationID}-{ClinSig}") %in% glue::glue("{consensus_calls$VariationID}-{consensus_calls$ClinSig}")) %>% @@ -331,7 +349,7 @@ variants_consensus_call <- conflicting_variants %>% distinct(VariationID, .keep_all = T) %>% arrange(VariationID) %>% dplyr::mutate(ReviewStatus_var = glue::glue("{ReviewStatus_var}, consensus call taken")) %>% - dplyr::select(-ClinSig) + dplyr::rename(ClinSig_report = ClinSig) # Resolve conflicts in remaining variants by taking call at date last evaluated variants_conflicts_latest <- conflicting_variants %>% @@ -341,13 +359,16 @@ variants_conflicts_latest <- conflicting_variants %>% dplyr::arrange(desc(mdy(LastEvaluated_sub))) %>% dplyr::slice_head(n = 1) %>% ungroup() %>% - dplyr::mutate(ReviewStatus_var = glue::glue("{ReviewStatus_var}, latest date evaluated call taken")) + dplyr::mutate( + ClinSig_report = ClinicalSignificance_sub, + ReviewStatus_var = glue::glue("{ReviewStatus_var}, latest date evaluated call taken") + ) # create final df of all resolved submissions, and take Clinical Significance of submission as ClinSig -submission_final_df <- variants_no_conflicts %>% +submission_final_df <- variants_resolved %>% bind_rows(variants_consensus_call, variants_conflicts_latest) %>% dplyr::mutate( - ClinicalSignificance = ClinicalSignificance_sub, + ClinicalSignificance = ClinSig_report, ReviewStatus = ReviewStatus_var, SubmittedPhenotypeInfo = case_when( SubmittedPhenotypeInfo == "Not Provided" ~ NA_character_, @@ -364,7 +385,7 @@ submission_final_df <- variants_no_conflicts %>% "LastEvaluated", "Description", "SubmittedPhenotypeInfo", "ReportedPhenotypeInfo", "ReviewStatus", "SubmittedGeneSymbol", "GeneSymbol", "vcf_id", - "Origin", "OriginSimple" + "Origin", "OriginSimple", "Stars" ))) # extract all variants with conflicting interpretations @@ -387,7 +408,9 @@ submission_final_df <- submission_final_df %>% TRUE ~ "None" )) +table(submission_final_df$ClinicalSignificance) + write_tsv( submission_final_df, - file.path(results_dir, "ClinVar-selected-submissions.tsv") + file.path(results_dir, "resolved-clinvar-interpretations.tsv") )