diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a11b5470..2183735d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -14,6 +14,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: 3.9 + - name: Set up miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + auto-update-conda: true + python-version: 3.9 - name: Install dependencies with conda run: | sudo chown -R $UID $CONDA && source $CONDA/etc/profile.d/conda.sh && conda env update --file environment.yml diff --git a/R/scripts/plot_gubbins.R b/R/scripts/plot_gubbins.R index fe58d68b..9194464d 100755 --- a/R/scripts/plot_gubbins.R +++ b/R/scripts/plot_gubbins.R @@ -533,7 +533,7 @@ plot_gubbins_recombination <- function(gubbins_rec,gubbins_tree, start_pos = NA, tidyr::unnest_longer(gene, values_to = "Taxa") %>% dplyr::mutate(Taxa = factor(Taxa, - levels = rev(get_taxa_name(gubbins_tree)))) %>% + levels = rev(ggtree::get_taxa_name(gubbins_tree)))) %>% dplyr::mutate(length = end - start + 1) %>% dplyr::arrange(rev(length)) %>% dplyr::select(-length) @@ -543,6 +543,16 @@ plot_gubbins_recombination <- function(gubbins_rec,gubbins_tree, start_pos = NA, trim_start(start_pos) %>% trim_end(end_pos) + # Get the number of taxa for selecting the line width + n_taxa <- length(ggtree::get_taxa_name(gubbins_tree)) + rec_linewidth <- + dplyr::case_when( + n_taxa < 10 ~ 5, + n_taxa < 50 ~ 2, + n_taxa < 100 ~ 1.5, + TRUE ~ 1 + ) + # Plot recombination rec_plot <- ggplot(gubbins_rec, @@ -551,7 +561,8 @@ plot_gubbins_recombination <- function(gubbins_rec,gubbins_tree, start_pos = NA, y = Taxa, yend = Taxa, colour = Colour)) + - geom_segment(alpha = 0.25) + + geom_segment(alpha = 0.5, + linewidth = rec_linewidth) + scale_colour_manual(values = c("red" = "red", "blue" = "blue")) + scale_y_discrete(drop = FALSE) + diff --git a/VERSION b/VERSION index fa7adc7a..2f4b6075 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.3.5 +3.4 diff --git a/configure.ac b/configure.ac index eeae5ae5..ff0fa07e 100644 --- a/configure.ac +++ b/configure.ac @@ -31,6 +31,7 @@ AC_PROG_CXX # Checks for pthread AX_PTHREAD +LDFLAGS="$LDFLAGS -pthread" # Checks for code coverage AX_CODE_COVERAGE diff --git a/docs/gubbins_manual.md b/docs/gubbins_manual.md index 2d12c8cd..e79b5076 100644 --- a/docs/gubbins_manual.md +++ b/docs/gubbins_manual.md @@ -279,6 +279,14 @@ To get summary statistics (e.g. ***r***/***m***) and subtrees for distinct clade extract_gubbins_clade_statistics.py --clades [file designating isolates to clades] --gff out.recombination_predictions.gff --snps out.branch_base_reconstruction.embl --tree out.final_tree.tre --out tree_anaysis ``` +To extract the recombinant sequences identified by Gubbins from the alignment: + +``` +extract_recombinant_sequences.py [-h] --aln [input alignment file] --gff out.recombination_predictions.gff --out-dir OUT_DIR [--start START] [--end END] [--terminal-only] +``` + +Note that currently this only identifies the unique alleles at the recombinant loci from the sequences in the alignment - it is not using the reconstructed sequences used to infer the recombinations on internal branches. This script will hopefully be updated to offer greater functionality in the future. + ## Examples Two example alignments can be downloaded from http://nickjcroucher.github.io/gubbins/: diff --git a/python/gubbins/common.py b/python/gubbins/common.py index f48ed1bc..420cc646 100644 --- a/python/gubbins/common.py +++ b/python/gubbins/common.py @@ -453,7 +453,7 @@ def parse_and_run(input_args, program_description=""): gubbins_command = create_gubbins_command( gubbins_exec, gaps_alignment_filename, gaps_vcf_filename, current_tree_name, input_args.alignment_filename, input_args.min_snps, input_args.min_window_size, input_args.max_window_size, - input_args.p_value, input_args.trimming_ratio, input_args.extensive_search) + input_args.p_value, input_args.trimming_ratio, input_args.extensive_search, input_args.threads) printer.print(["\nRunning Gubbins to detect recombinations...", gubbins_command]) try: subprocess.check_call(gubbins_command, shell=True) @@ -798,10 +798,10 @@ def select_best_models(snp_alignment_filename,basename,current_tree_builder,inpu def create_gubbins_command(gubbins_exec, alignment_filename, vcf_filename, current_tree_name, original_alignment_filename, min_snps, min_window_size, max_window_size, - p_value, trimming_ratio, extensive_search): + p_value, trimming_ratio, extensive_search, num_threads): command = [gubbins_exec, "-r", "-v", vcf_filename, "-a", str(min_window_size), "-b", str(max_window_size), "-f", original_alignment_filename, "-t", current_tree_name, - "-m", str(min_snps), "-p", str(p_value), "-i", str(trimming_ratio)] + "-m", str(min_snps), "-p", str(p_value), "-i", str(trimming_ratio), "-n", str(num_threads)] if extensive_search: command.append("-x") command.append(alignment_filename) diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py index e4e5a1f1..4d4a2302 100644 --- a/python/gubbins/tests/test_python_scripts.py +++ b/python/gubbins/tests/test_python_scripts.py @@ -7,6 +7,7 @@ import unittest import os +import shutil import subprocess import hashlib import glob @@ -144,6 +145,17 @@ def test_recombination_counting_per_gene(self): assert self.md5_check(out_tab, check_tab) os.remove(out_tab) + # Test clade file extraction script + def test_extract_recombination_sequences(self): + multiple_aln = os.path.join(data_dir, "multiple_recombinations.aln") + multiple_gff = os.path.join(data_dir, "multiple_recombinations_gubbins.recombination_predictions.gff") + out_dir = os.path.join(data_dir, "test_loci") + rec_count_cmd = "extract_recombinant_sequences.py --aln " + multiple_aln + " --gff " + multiple_gff + " --out-dir " + out_dir + subprocess.check_call(rec_count_cmd, shell=True) + file_count = int(subprocess.run('ls -1 ' + out_dir + ' | wc -l', shell = True, stdout=subprocess.PIPE).stdout.decode('utf-8').rstrip()) + assert file_count > 1 + shutil.rmtree(out_dir) + # Test plotting script (an R exception) def test_recombination_counting_per_gene(self): tree_input = os.path.join(data_dir, "expected_RAxML_result.multiple_recombinations.iteration_5") diff --git a/python/gubbins/tests/test_utils.py b/python/gubbins/tests/test_utils.py index 2ee42b90..c3c53820 100644 --- a/python/gubbins/tests/test_utils.py +++ b/python/gubbins/tests/test_utils.py @@ -20,8 +20,8 @@ class TestUtilities(unittest.TestCase): def test_gubbins_command(self): - assert common.create_gubbins_command('AAA', 'BBB', 'CCC', 'DDD', 'EEE', 5, 10, 200, 0.05, 1.0, 0) \ - == 'AAA -r -v CCC -a 10 -b 200 -f EEE -t DDD -m 5 -p 0.05 -i 1.0 BBB' + assert common.create_gubbins_command('AAA', 'BBB', 'CCC', 'DDD', 'EEE', 5, 10, 200, 0.05, 1.0, 0, 1) \ + == 'AAA -r -v CCC -a 10 -b 200 -f EEE -t DDD -m 5 -p 0.05 -i 1.0 -n 1 BBB' def test_translation_of_filenames_to_final_filenames(self): assert common.translation_of_filenames_to_final_filenames('AAA', 'test') == { diff --git a/python/gubbins/treebuilders.py b/python/gubbins/treebuilders.py index 5f531a53..203ade8c 100644 --- a/python/gubbins/treebuilders.py +++ b/python/gubbins/treebuilders.py @@ -289,7 +289,7 @@ def __init__(self, threads: 1, model: str, bootstrap = 0, seed = None, internal_ self.citation = "https://doi.org/10.1093/molbev/msaa015" # Set parallelisation - command.extend(["-nt", str(self.threads)]) + command.extend(["-T", str(self.threads)]) # Add flags command.extend(["-safe","-redo"]) diff --git a/python/gubbins/utils.py b/python/gubbins/utils.py index dc19a8fe..5eff6e00 100644 --- a/python/gubbins/utils.py +++ b/python/gubbins/utils.py @@ -121,7 +121,8 @@ def choose_executable_based_on_processor(list_of_executables: list): for executable in list_of_executables: if cpu_info and 'AVX2' in executable and 'avx2' in flags and which(executable): break - elif cpu_info and 'AVX' in executable and 'avx' in flags and which(executable): + elif cpu_info and ('AVX' in executable and 'AVX2' not in executable)\ + and ('avx' in flags and 'avx2' not in flags) and which(executable): break elif cpu_info and 'SSE3' in executable and 'sse3' in flags and which(executable): break diff --git a/python/scripts/extract_recombinant_sequences.py b/python/scripts/extract_recombinant_sequences.py new file mode 100755 index 00000000..6fee96a9 --- /dev/null +++ b/python/scripts/extract_recombinant_sequences.py @@ -0,0 +1,101 @@ +#! python + +# encoding: utf-8 +# Wellcome Trust Sanger Institute and Imperial College London +# Copyright (C) 2020 Wellcome Trust Sanger Institute and Imperial College London +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# + +# Generic imports +import os +import sys +import argparse +import re +import math + +# Biopython imports +from Bio import AlignIO +from Bio import Phylo +from Bio import SeqIO +from Bio.Align import MultipleSeqAlignment +from Bio.Seq import Seq + +# command line parsing +def get_options(): + + parser = argparse.ArgumentParser(description='Extract all the unique alleles at recombinant loci') + + # input options + parser.add_argument('--aln', + help = 'Input alignment (FASTA format)', + required = True) + parser.add_argument('--gff', + help = 'GFF of recombinant regions detected by Gubbins', + required = True) + parser.add_argument('--out-dir', + help = 'Output directory', + required = True) + parser.add_argument('--start', + help = 'Start of region of interest', + default = 1, + required = False) + parser.add_argument('--end', + help = 'End of region of interest', + default = math.inf, + required = False) + parser.add_argument('--terminal-only', + help = 'Only extract recombinations on terminal branches', + default = False, + action = 'store_true') + + return parser.parse_args() + +# main code +if __name__ == "__main__": + + # Get command line options + args = get_options() + + # Create output directory + if not os.path.isdir(args.out_dir): + os.mkdir(args.out_dir) + + # Read recombinant regions from GFF + rec_start = [] + rec_end = [] + with open(args.gff,'r') as gff_file: + for line in gff_file.readlines(): + if not line.startswith('##'): + # Get coordinates + info = line.rstrip().split('\t') + taxon_pattern = re.compile('taxa="([^"]*)"') + taxon_set = set(taxon_pattern.search(info[8]).group(1).split()) + if (len(taxon_set) == 1 or not args.terminal_only): + rec_start.append(int(info[3])) + rec_end.append(int(info[4])) + + # Read in alignment and identify recombinations + alignment = AlignIO.read(args.aln,'fasta') + for (start,end) in zip(rec_start,rec_end): + if start >= args.start and end <= args.end: + out_fn = 'locus_' + str(start) + '_' + str(end) + '.aln' + with open(os.path.join(args.out_dir,out_fn),'w') as out_file: + seen_seqs = [] + rec_locus_alignment = alignment[:,(start-1):end] + for taxon in rec_locus_alignment: + if taxon.seq not in seen_seqs: + out_file.write('>' + taxon.id + '\n' + str(taxon.seq) + '\n') + seen_seqs.append(taxon.seq) diff --git a/python/scripts/generate_ska_alignment.py b/python/scripts/generate_ska_alignment.py index bd3b1fcf..9dd135aa 100755 --- a/python/scripts/generate_ska_alignment.py +++ b/python/scripts/generate_ska_alignment.py @@ -118,7 +118,7 @@ def ska_map_sequences(seq, k = None, ref = None): ' --out ' + args.out + '.csv', shell = True) - sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\n") + sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\nPlease cite https://doi.org/10.1101/2024.03.25.586631\n") # Clean up if not args.no_cleanup: diff --git a/python/setup.py b/python/setup.py index ad07a089..e6e8cd5c 100755 --- a/python/setup.py +++ b/python/setup.py @@ -22,6 +22,7 @@ 'scripts/gubbins_alignment_checker.py', 'scripts/generate_files_for_clade_analysis.py', 'scripts/count_recombinations_per_gene.py', + 'scripts/extract_recombinant_sequences.py', '../R/scripts/plot_gubbins.R' ], tests_require=[ diff --git a/src/Newickform.c b/src/Newickform.c index 38a6a11c..825b4078 100644 --- a/src/Newickform.c +++ b/src/Newickform.c @@ -19,6 +19,8 @@ #define __NEWICKFORM_C__ +#include +#include "stdio.h" #include "seqUtil.h" #include "Newickform.h" #include "branch_sequences.h" @@ -28,7 +30,242 @@ #define STR_OUT "out" -newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns,int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag) +// Define thread function for inserting gaps into sequences +void* gaps_threadFunction(void* arg) { + + // Extract thread data + struct gaps_ThreadData* data = (struct gaps_ThreadData*)arg; + + if (data->num_nodes_to_process > -1) + { + + for (int node_num_index = data->start_node; node_num_index < data->start_node+data->num_nodes_to_process; ++node_num_index) + { + int node_index = data->node_indices[node_num_index]; + int parent_node_index = data->parents[node_index]; + fill_in_recombinations_with_gaps(data->nodeArray, + node_index, + parent_node_index, + data->recombinations_array, + data->num_recombinations_array, + data->current_total_snps_array, + data->num_blocks_array, + data->length_of_original_genome, + data->snp_locations, + data->number_of_snps); + } + + } + + // Exit thread + pthread_exit(NULL); +} + +// Define thread function for identifying recombinations +void* rec_threadFunction(void* arg) { + + // Extract thread data + struct rec_ThreadData* data = (struct rec_ThreadData*)arg; + + if (data->num_nodes_to_process > -1) + { + for (int node_num_index = data->start_node; node_num_index < data->start_node+data->num_nodes_to_process; ++node_num_index) + { + int node_index = data->node_indices[node_num_index]; + // Generate branch sequences and identify recombinations + generate_branch_sequences(data->nodeArray[node_index], + data->vcf_file_pointer, + data->snp_locations, + data->number_of_snps, + data->column_names, + data->number_of_columns, + data->length_of_original_genome, + data->block_file_pointer, + data->gff_file_pointer, + data->min_snps, + data->branch_snps_file_pointer, + data->window_min, + data->window_max, + data->uncorrected_p_value, + data->trimming_ratio, + data->extensive_search_flag, + node_index); + } + } + + // Exit thread + pthread_exit(NULL); +} + +// Function to extract nodes relevant for each depth +void get_job_nodes(newick_node** jobNodeArray, newick_node** nodeArray, int* node_depths, int depth, int num_nodes) +{ + int j = 0; + for (int i = 0; i < num_nodes; ++i) + { + if (node_depths[i] == depth) + { + jobNodeArray[j] = nodeArray[i]; // TO DO convert to pointer + ++j; + } + } +} + +// Function to identify nodes relevant for each depth +void get_job_node_indices(int* jobNodeIndexArray, newick_node** nodeArray, int* node_depths, int depth, int num_nodes) +{ + int j = 0; + for (int i = 0; i < num_nodes; ++i) + { + if (node_depths[i] == depth) + { + jobNodeIndexArray[j] = i; + ++j; + } + } +} + +// Function to count number of jobs to run at a particular depth +int get_job_counts(int *node_depths, int depth, int num_nodes) +{ + int count = 0; + for (int i = 0; i < num_nodes; ++i) + { + if (node_depths[i] == depth) + { + count++; + } + } + return count; +} + +// Function to create an array of nodes +void fill_nodeArray(newick_node *root,newick_node** nodeArray, int num_nodes) +{ + if (root->childNum != 0) + { + newick_child *child; + child = root->child; + while (child != NULL) + { + fill_nodeArray(child->node,nodeArray, num_nodes); + child = child->next; + } + } + for (int i = 0; i < num_nodes; ++i) + { + if (nodeArray[i]->taxon == NULL) + { + nodeArray[i] = root; + break; + } + } +} + +// Function to count the total number of tree nodes +int count_tree_nodes(newick_node* root) { + if (root == NULL) return 0; + int count = 1; // Count the root node + // Recursively count nodes in the child subtree + if (root->childNum != 0) + { + newick_child *child; + child = root->child; + while (child != NULL) + { + count += count_tree_nodes(child->node); + child = child->next; + } + } + return count; +} + +// Function to find the maximum distance from an internal node to the tips +int max_distance_to_tips(newick_node *root) { + + // Initialize the maximum distance to 0 + int max_distance = 0; + + // Distance is non-zero if not leaf node + if (root->childNum != 0) + { + // Traverse each child of the internal node + newick_child* child = root->child; + while (child != NULL) + { + // Recursively find the maximum distance from the child to the tips + int child_distance = max_distance_to_tips(child->node); + // Add one for the distance from the child to the parent + child_distance++; + // Update the maximum distance if the distance from this child is greater + if (child_distance > max_distance) { + max_distance = child_distance; + } + child = child->next; + } + } + + return max_distance; +} + +// Recursive function to traverse the tree and populate the parents list +void populate_parents(newick_node *node, newick_node** nodeArray, int * parents, int num_nodes) { + + // Get index of parent + int parent_index; + for (int i = 0; i < num_nodes; ++i) + { + if (nodeArray[i]->taxon == node->taxon) + { + parent_index = i; + break; + } + } + + // Get indices of children + if (node->child != NULL) { + newick_child *child = node->child; + while (child != NULL) { + for (int j = 0; j < num_nodes; ++j) + { + if (nodeArray[j]->taxon == child->node->taxon) + { + parents[j] = parent_index; + break; + } + } + child = child->next; + } + } + + // Recursively traverse child nodes + if (node->child != NULL) { + newick_child *child = node->child; + while (child != NULL) { + populate_parents(child->node, nodeArray, parents, num_nodes); + child = child->next; + } + } +} + +// Function to initialize the parents list and call the recursive function +int * get_parents(newick_node *root, newick_node** nodeArray, int num_nodes) { + + // Initialise parent node array + int * parents = calloc(num_nodes,sizeof(int)); + + // Initialize all elements to NULL + for (int i = 0; i < num_nodes; i++) { + parents[i] = -1; + } + + // Populate the parents list recursively + populate_parents(root, nodeArray, parents, num_nodes); + + return parents; +} + +newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns,int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int num_threads) { int iLen, iMaxLen; char *pcTreeStr; @@ -36,7 +273,7 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp char *pcOutputFile; char acStrArray[256]; newick_node *root; - char *returnchar; + char *returnchar; FILE *f; @@ -71,54 +308,193 @@ newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp root = parseTree(pcTreeStr); // output tab file - FILE * block_file_pointer; - char block_file_name[MAX_FILENAME_SIZE] = {""}; - char block_file_extension[5]= {".tab"}; + FILE * block_file_pointer; + char block_file_name[MAX_FILENAME_SIZE] = {""}; + char block_file_extension[5]= {".tab"}; memcpy(block_file_name, filename, size_of_string(filename) +1); concat_strings_created_with_malloc(block_file_name,block_file_extension); block_file_pointer = fopen(block_file_name, "w"); // output tab file - FILE * branch_snps_file_pointer; - char branch_snps_file_name[MAX_FILENAME_SIZE]= {""}; - char branchtab_extension[18]= {".branch_snps.tab"}; + FILE * branch_snps_file_pointer; + char branch_snps_file_name[MAX_FILENAME_SIZE]= {""}; + char branchtab_extension[18]= {".branch_snps.tab"}; memcpy(branch_snps_file_name, filename, size_of_string(filename) +1); concat_strings_created_with_malloc(branch_snps_file_name,branchtab_extension); branch_snps_file_pointer = fopen(branch_snps_file_name, "w"); // output gff file FILE * gff_file_pointer; - char gff_file_name[MAX_FILENAME_SIZE]= {""}; - memcpy(gff_file_name, filename, size_of_string(filename) +1); - char gff_extension[5]= {".gff"}; + char gff_file_name[MAX_FILENAME_SIZE]= {""}; + memcpy(gff_file_name, filename, size_of_string(filename) +1); + char gff_extension[5]= {".gff"}; concat_strings_created_with_malloc(gff_file_name,gff_extension); gff_file_pointer = fopen(gff_file_name, "w"); print_gff_header(gff_file_pointer,length_of_original_genome); - char * root_sequence = NULL; - - carry_unambiguous_gaps_up_tree(root); - root_sequence = generate_branch_sequences(root, - vcf_file_pointer, - snp_locations, - number_of_snps, - column_names, - number_of_columns, - root_sequence, - length_of_original_genome, - block_file_pointer, - gff_file_pointer, - min_snps, - branch_snps_file_pointer, - window_min, - window_max, - uncorrected_p_value, - trimming_ratio, - extensive_search_flag); - free(root_sequence); - int * parent_recombinations = NULL; - fill_in_recombinations_with_gaps(root, parent_recombinations, 0, 0,0,root->block_coordinates,length_of_original_genome,snp_locations,number_of_snps); + // iterate through tree to get list of nodes + int num_nodes = 0; + newick_node * current_node = root; + num_nodes = count_tree_nodes(current_node); + newick_node** nodeArray = malloc(num_nodes * sizeof(newick_node*)); + for (int i = 0; i < num_nodes; ++i) + { + nodeArray[i] = (newick_node*)seqMalloc(sizeof(newick_node)); + } + fill_nodeArray(current_node,nodeArray,num_nodes); + + // get parent of each node + int * parents = get_parents(root, nodeArray, num_nodes); + + // get depths of each node from tips + int max_depth = max_distance_to_tips(root); + int *node_depths = malloc(num_nodes * sizeof(int)); + for (int i = 0; i < num_nodes; ++i) + { + node_depths[i] = max_distance_to_tips(nodeArray[i]); + } + + // Initiate multithreading + pthread_t threads[num_threads]; + struct rec_ThreadData rec_ThreadData[num_threads]; + struct gaps_ThreadData gaps_ThreadData[num_threads]; + + // iterate through depths and identify batches of analyses to be run + for (int depth = 0; depth <= max_depth; ++depth) + { + + // Identify number of nodes at the current depth + int num_jobs = get_job_counts(node_depths,depth,num_nodes); + int * jobNodeIndexArray = malloc(num_jobs * sizeof(int)); + get_job_node_indices(jobNodeIndexArray,nodeArray,node_depths,depth,num_nodes); + + // Divide jobNodeArray among threads + int numJobsPerThread = num_jobs / num_threads; + int remainder = num_jobs % num_threads; + + // Create and execute threads + for (int i = 0; i < num_threads; ++i) { + + // Calculate start and end indices for current thread + int startIndex = i * numJobsPerThread + (i < remainder ? i : remainder); + int endIndex = startIndex + numJobsPerThread + (i < remainder ? 1 : 0) - 1; + + // Set thread data + rec_ThreadData[i].node_indices = jobNodeIndexArray; + rec_ThreadData[i].nodeArray = nodeArray; + rec_ThreadData[i].start_node = startIndex; + rec_ThreadData[i].num_nodes_to_process = endIndex - startIndex + 1; // Number of nodes for this thread + rec_ThreadData[i].vcf_file_pointer = vcf_file_pointer; + rec_ThreadData[i].snp_locations = snp_locations; + rec_ThreadData[i].number_of_snps = number_of_snps; + rec_ThreadData[i].column_names = column_names; + rec_ThreadData[i].number_of_columns = number_of_columns; + rec_ThreadData[i].length_of_original_genome = length_of_original_genome; + rec_ThreadData[i].block_file_pointer = block_file_pointer; + rec_ThreadData[i].gff_file_pointer = gff_file_pointer; + rec_ThreadData[i].min_snps = min_snps; + rec_ThreadData[i].branch_snps_file_pointer = branch_snps_file_pointer; + rec_ThreadData[i].window_min = window_min; + rec_ThreadData[i].window_max = window_max; + rec_ThreadData[i].uncorrected_p_value = uncorrected_p_value; + rec_ThreadData[i].trimming_ratio = trimming_ratio; + rec_ThreadData[i].extensive_search_flag = extensive_search_flag; + rec_ThreadData[i].thread_index = i; + + // Create thread + if (pthread_create(&threads[i], NULL, rec_threadFunction, (void*)&rec_ThreadData[i]) != 0) { + perror("pthread_create"); + exit(EXIT_FAILURE); + } + + } + + // Join threads + for (int i = 0; i < num_threads; ++i) { + pthread_join(threads[i], NULL); + } + + // Free jobNodeArray + free(jobNodeIndexArray); + + } + + // Define data structures needed to record statistics and mask recombined sequence + int ** recombinations_array = calloc(num_nodes,sizeof(int*)); + for (int i = 0; i < num_nodes; ++i) { + recombinations_array[i] = NULL; + } + int * num_recombinations_array = calloc(num_nodes,sizeof(int)); + int * current_total_snps_array = calloc(num_nodes,sizeof(int)); + int * num_blocks_array = calloc(num_nodes,sizeof(int)); + for (int i = 0; i < num_nodes; ++i) + { + num_recombinations_array[i] = 0; + current_total_snps_array[i] = 0; + num_blocks_array[i] = 0; + } +// int * parent_recombinations = NULL; + + // Iterate from root to tips to record statistics and mask recombined sequence + for (int depth = max_depth; depth >= 0; --depth) { + + // Identify number of nodes at the current depth + int num_jobs = get_job_counts(node_depths,depth,num_nodes); + int * jobNodeIndexArray = malloc(num_jobs * sizeof(int)); + get_job_node_indices(jobNodeIndexArray,nodeArray,node_depths,depth,num_nodes); + + // Divide jobNodeArray among threads + int numJobsPerThread = num_jobs / num_threads; + int remainder = num_jobs % num_threads; + + // Create and execute threads + for (int i = 0; i < num_threads; ++i) { + + // Calculate start and end indices for current thread + int startIndex = i * numJobsPerThread + (i < remainder ? i : remainder); + int endIndex = startIndex + numJobsPerThread + (i < remainder ? 1 : 0) - 1; + + // Set thread data + gaps_ThreadData[i].node_indices = jobNodeIndexArray; + gaps_ThreadData[i].start_node = startIndex; + gaps_ThreadData[i].num_nodes_to_process = endIndex - startIndex + 1; // Number of nodes for this thread + gaps_ThreadData[i].nodeArray = nodeArray; + gaps_ThreadData[i].parents = parents; + gaps_ThreadData[i].recombinations_array = recombinations_array; + gaps_ThreadData[i].num_recombinations_array = num_recombinations_array; + gaps_ThreadData[i].current_total_snps_array = current_total_snps_array; + gaps_ThreadData[i].num_blocks_array = num_blocks_array; + gaps_ThreadData[i].length_of_original_genome = length_of_original_genome; + gaps_ThreadData[i].snp_locations = snp_locations; + gaps_ThreadData[i].number_of_snps = number_of_snps; + gaps_ThreadData[i].thread_index = i; + + // Create thread + if (pthread_create(&threads[i], NULL, gaps_threadFunction, (void*)&gaps_ThreadData[i]) != 0) { + perror("pthread_create"); + exit(EXIT_FAILURE); + } + + } + + // Join threads + for (int i = 0; i < num_threads; ++i) { + pthread_join(threads[i], NULL); + } + } + + // Free gaps arrays + free(num_recombinations_array); + free(current_total_snps_array); + free(num_blocks_array); + free(recombinations_array); + + // Free general arrays + free(nodeArray); + free(node_depths); + free(parents); + fclose(block_file_pointer); fclose(gff_file_pointer); fclose(branch_snps_file_pointer); diff --git a/src/Newickform.h b/src/Newickform.h index f5818c0a..2e32c09a 100644 --- a/src/Newickform.h +++ b/src/Newickform.h @@ -20,7 +20,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ - typedef struct newick_child { struct newick_node *node; @@ -47,17 +46,70 @@ typedef struct newick_node struct newick_node *parent; } newick_node; +// Define the structure to hold thread arguments +struct rec_ThreadData { + int * node_indices; // Indices of nodes to be processed by all threads + newick_node ** nodeArray; // Pointer to the array of node sequences + int start_node; // Index of starting node for this thread + int num_nodes_to_process; // Number of nodes to process by this thread + char** node_sequences; // Pointer to the array of node sequences + char** node_names; // Pointer to the array of node names + FILE* vcf_file_pointer; // Pointer to the VCF file + int* snp_locations; // Pointer to the array of SNP locations + int number_of_snps; // Number of SNPs + char** column_names; // Pointer to the array of column names + int number_of_columns; // Number of columns + int length_of_original_genome; // Length of the original genome + int num_stored_nodes; // Number of stored nodes + FILE* block_file_pointer; // Pointer to the block file + FILE* gff_file_pointer; // Pointer to the GFF file + int min_snps; // Minimum number of SNPs + FILE* branch_snps_file_pointer; // Pointer to the branch SNPs file + int window_min; // Minimum window size + int window_max; // Maximum window size + float uncorrected_p_value; // Uncorrected p-value + float trimming_ratio; // Trimming ratio + int extensive_search_flag; // Extensive search flag + int thread_index; // Thread index +}; + +// Define the structure to hold thread arguments +struct gaps_ThreadData { + int * node_indices; // Nodes to be processed by this thread + int start_node; // Index of starting node for this thread + int num_nodes_to_process; // Number of nodes to process by this thread + newick_node ** nodeArray; // Pointer to the array of node sequences + int * parents; // Indices of the parents of the nodes + int ** recombinations_array; // Pointer to the list of recombination coordinates + int * num_recombinations_array; // Pointer to counts of recombinations + int * current_total_snps_array; // Number of SNPs on each branch + int * num_blocks_array; // Number of recombination on each branch + int length_of_original_genome; // Length of the original genome + int * snp_locations; // List of SNP locations + int number_of_snps; // Number of SNP sites in the alignment + int thread_index; // Thread index +}; + #define MAX_FILENAME_SIZE 1024 #ifdef __NEWICKFORM_C__ newick_node* parseTree(char *str); -newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag); +newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int num_threads); void print_tree(newick_node *root, FILE * outputfile); char* strip_quotes(char *taxon); #else extern newick_node* parseTree(char *str); -extern newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag); +extern newick_node* build_newick_tree(char * filename, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, int length_of_original_genome,int min_snps, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int num_threads); +void* rec_threadFunction(void* arg); extern void print_tree(newick_node *root, FILE * outputfile); +void fill_nodeArray(newick_node *root, newick_node** nodeArray); +int count_tree_nodes(newick_node* root); +void get_job_nodes(newick_node** jobNodeArray,newick_node** nodeArray,int* node_depths,int depth,int num_nodes); +void get_job_node_indices(int* jobNodeIndexArray, newick_node** nodeArray, int* node_depths, int depth, int num_nodes); +void get_job_counts(int *node_depths, int depth, int num_nodes); +int * get_parents(newick_node *root, newick_node** nodeArray, int num_nodes); +void populate_parents(newick_node *node, newick_node** nodeArray, int * parents, int num_nodes); +void* gaps_threadFunction(void* arg); extern char* strip_quotes(char *taxon); #endif diff --git a/src/block_tab_file.c b/src/block_tab_file.c index 4eadfb51..409d9f9c 100644 --- a/src/block_tab_file.c +++ b/src/block_tab_file.c @@ -21,10 +21,18 @@ #include #include #include "block_tab_file.h" +#include + +// Define a mutex variable +static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; void print_block_details(FILE * block_file_pointer, int start_coordinate, int end_coordinate, int number_of_snps, char * current_node_id, char * parent_node_id, char * taxon_names, int number_of_child_nodes, double neg_log_likelihood) { - fprintf(block_file_pointer, "FT misc_feature %d..%d\n", start_coordinate, end_coordinate); + + // Lock the mutex before accessing the file + pthread_mutex_lock(&mutex); + + fprintf(block_file_pointer, "FT misc_feature %d..%d\n", start_coordinate, end_coordinate); fprintf(block_file_pointer, "FT /node=\"%s->%s\"\n",parent_node_id,current_node_id); fprintf(block_file_pointer, "FT /neg_log_likelihood=\"%f\"\n",neg_log_likelihood); @@ -39,11 +47,19 @@ void print_block_details(FILE * block_file_pointer, int start_coordinate, int en fprintf(block_file_pointer, "FT /taxa=\"%s\"\n",taxon_names); fprintf(block_file_pointer, "FT /SNP_count=\"%d\"\n",number_of_snps); fflush(block_file_pointer); + + // Unlock the mutex after accessing the file + pthread_mutex_unlock(&mutex); + } void print_branch_snp_details(FILE * branch_snps_file_pointer, char * current_node_id, char * parent_node_id, int * branches_snp_sites, int number_of_branch_snps, char * branch_snp_sequence, char * branch_snp_ancestor_sequence,char * taxon_names) { + + // Lock the mutex before accessing the file + pthread_mutex_lock(&mutex); + int i = 0; for(i=0; i< number_of_branch_snps; i++) { @@ -55,4 +71,8 @@ void print_branch_snp_details(FILE * branch_snps_file_pointer, char * current_no fprintf(branch_snps_file_pointer, "FT /replace=\"%c\"\n",branch_snp_sequence[i]); fflush(branch_snps_file_pointer); } + + // Unlock the mutex after accessing the file + pthread_mutex_unlock(&mutex); + } diff --git a/src/branch_sequences.c b/src/branch_sequences.c index 8ffc88aa..b89a6552 100644 --- a/src/branch_sequences.c +++ b/src/branch_sequences.c @@ -33,9 +33,6 @@ #include "gff_file.h" #include "string_cat.h" -int node_counter = 0; - - // Order is not preserved. int copy_and_concat_integer_arrays(int * array_1, int array_1_size, int * array_2, int array_2_size, int * output_array) { @@ -108,115 +105,138 @@ int get_list_of_snp_indices_which_fall_in_downstream_recombinations(int ** curre // Go through the tree and build up the recombinations list from root to branch. Print out each sample name and a list of recombinations -void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinations, int parent_num_recombinations, int current_total_snps,int num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps ) +void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** current_recombinations, int * num_recombinations, int * current_total_snps, int * num_blocks, int length_of_original_genome,int * snp_locations, int number_of_snps) { - newick_child *child; - int * current_recombinations; - int num_current_recombinations = 0 ; - char * child_sequence = (char *) calloc((length_of_original_genome +1),sizeof(char)); - - current_recombinations = (int *) calloc((root->num_recombinations+1+parent_num_recombinations),sizeof(int)); - num_current_recombinations = copy_and_concat_integer_arrays(root->recombinations, - root->num_recombinations, - parent_recombinations, - parent_num_recombinations, - current_recombinations); + // Define the relevant tree nodes + newick_node * node = nodeArray[node_index]; + // overwrite the bases of snps with N's int i; int sequence_index; - sequence_index = find_sequence_index_from_sample_name(root->taxon); + sequence_index = find_sequence_index_from_sample_name(node->taxon); - set_number_of_recombinations_for_sample(root->taxon,root->num_recombinations); - set_number_of_snps_for_sample(root->taxon,root->number_of_snps); + // Set sample statistics for printing using information from node + set_number_of_recombinations_for_sample(sequence_index,node->num_recombinations); + set_number_of_snps_for_sample(sequence_index,node->number_of_snps); - get_sequence_for_sample_name(child_sequence, root->taxon); - int genome_length_excluding_blocks_and_gaps = calculate_genome_length_excluding_blocks_and_gaps(child_sequence, - length_of_original_genome, - current_block_coordinates, - num_blocks); - - set_genome_length_excluding_blocks_and_gaps_for_sample(root->taxon, - genome_length_excluding_blocks_and_gaps); - - int ** merged_block_coordinates; - merged_block_coordinates = (int **) calloc(3,sizeof(int *)); - merged_block_coordinates[0] = (int*) calloc((num_blocks + root->number_of_blocks+1),sizeof(int )); - merged_block_coordinates[1] = (int*) calloc((num_blocks + root->number_of_blocks+1),sizeof(int )); - copy_and_concat_2d_integer_arrays(current_block_coordinates, - num_blocks, - root->block_coordinates, - root->number_of_blocks, - merged_block_coordinates - ); - - set_number_of_blocks_for_sample(root->taxon, root->number_of_blocks); - set_number_of_branch_bases_in_recombinations(root->taxon, - calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates, - root->number_of_blocks, - child_sequence, - snp_locations, - current_total_snps) - ); - set_number_of_bases_in_recombinations(root->taxon, - calculate_number_of_bases_in_recombations_excluding_gaps(merged_block_coordinates, - (num_blocks + root->number_of_blocks), - child_sequence, - snp_locations, - current_total_snps) + // Set root-specific values + if (parent_node_index == -1) + { + for (int include_gaps = 0; include_gaps <= 1; ++include_gaps) { + set_number_of_bases_in_recombinations(sequence_index,0,include_gaps); + set_number_of_branch_bases_in_recombinations(sequence_index,0,include_gaps); + } + set_internal_node(1,sequence_index); + set_genome_length_excluding_blocks_and_gaps_for_sample(sequence_index, + length_of_original_genome); + } + else + { + + // Get parental sequence + newick_node * parent = nodeArray[parent_node_index]; + char * node_sequence = (char *) calloc((length_of_original_genome +1),sizeof(char)); + get_sequence_for_sample_index(node_sequence, sequence_index); + + // Calculate clonal frame remaining at the parental node + int ** current_block_coordinates = parent->block_coordinates; + int genome_length_excluding_blocks_and_gaps = calculate_genome_length_excluding_blocks_and_gaps(node_sequence, + length_of_original_genome, + current_block_coordinates, + num_blocks[parent_node_index]); + + set_genome_length_excluding_blocks_and_gaps_for_sample(sequence_index, + genome_length_excluding_blocks_and_gaps); + + // Identify the recombinations leading to this node from the root + current_recombinations[node_index] = (int *) calloc((node->num_recombinations+1+num_recombinations[parent_node_index]),sizeof(int)); + num_recombinations[node_index] = copy_and_concat_integer_arrays(node->recombinations, + node->num_recombinations, + current_recombinations[parent_node_index], + num_recombinations[parent_node_index], + current_recombinations[node_index]); + + // Merge block coordinates putting most recent blocks first + int ** merged_block_coordinates; + merged_block_coordinates = (int **) calloc(3,sizeof(int *)); + merged_block_coordinates[0] = (int*) calloc((num_blocks[parent_node_index] + node->number_of_blocks+1),sizeof(int )); + merged_block_coordinates[1] = (int*) calloc((num_blocks[parent_node_index] + node->number_of_blocks+1),sizeof(int )); + copy_and_concat_2d_integer_arrays(node->block_coordinates, + node->number_of_blocks, + current_block_coordinates, + num_blocks[parent_node_index], + merged_block_coordinates ); - free(child_sequence); - - for(i = 0; i < num_current_recombinations; i++) - { - update_sequence_base('N', sequence_index, current_recombinations[i]); - } - - // TODO: The stats for the number of snps in recombinations will need to be updated. - int * snps_in_recombinations = (int *) calloc((number_of_snps +1),sizeof(int)); - int num_snps_in_recombinations = get_list_of_snp_indices_which_fall_in_downstream_recombinations(merged_block_coordinates, - (num_blocks + root->number_of_blocks), - snp_locations, - number_of_snps, - snps_in_recombinations); - for(i = 0; i < num_snps_in_recombinations; i++) - { - update_sequence_base('N', sequence_index, snps_in_recombinations[i]); - } - free(snps_in_recombinations); - - if (root->childNum > 0) - { - child = root->child; - set_internal_node(1,sequence_index); - - while (child != NULL) - { - fill_in_recombinations_with_gaps(child->node, - current_recombinations, - num_current_recombinations, - (current_total_snps + root->number_of_snps), - (num_blocks + root->number_of_blocks), - merged_block_coordinates, - length_of_original_genome, - snp_locations, - number_of_snps - ); - child = child->next; + + // Set the number of recombination blocks for the sample + set_number_of_blocks_for_sample(node->taxon, node->number_of_blocks); + + // Set number of branch bases in recombination by iterating through + // the first part of merged blocks (i.e. only blocks on the branch to this node) + for (int include_gaps = 0; include_gaps <= 1; ++include_gaps) { + set_number_of_branch_bases_in_recombinations(sequence_index, + calculate_number_of_bases_in_recombinations(merged_block_coordinates, + node->number_of_blocks, + node_sequence, + snp_locations, + number_of_snps, + include_gaps), + include_gaps + ); + } + + // Set number of total bases in recombination by iterating through + // all merged blocks leading to this node + for (int include_gaps = 0; include_gaps <= 1; ++include_gaps) { + set_number_of_bases_in_recombinations(sequence_index, + calculate_number_of_bases_in_recombinations(merged_block_coordinates, + (num_blocks[parent_node_index] + node->number_of_blocks), + node_sequence, + snp_locations, + number_of_snps, + include_gaps), + include_gaps + ); + } + free(node_sequence); + + for(i = 0; i < num_recombinations[node_index]; i++) + { + update_sequence_base('N', sequence_index, current_recombinations[node_index][i]); + } + + // TODO: The stats for the number of snps in recombinations will need to be updated. + int * snps_in_recombinations = (int *) calloc((number_of_snps +1),sizeof(int)); + int num_snps_in_recombinations = get_list_of_snp_indices_which_fall_in_downstream_recombinations(merged_block_coordinates, + (num_blocks[parent_node_index] + node->number_of_blocks), + snp_locations, + number_of_snps, + snps_in_recombinations); + + for(i = 0; i < num_snps_in_recombinations; i++) + { + update_sequence_base('N', sequence_index, snps_in_recombinations[i]); + } + free(snps_in_recombinations); + + if (node->childNum > 0) + { + set_internal_node(1,sequence_index); + // Update number of SNPs + current_total_snps[node_index] = current_total_snps[parent_node_index] + node->num_recombinations + node->number_of_snps; + num_blocks[node_index] = num_blocks[parent_node_index] + node->number_of_blocks; + nodeArray[node_index]->block_coordinates = merged_block_coordinates; + } + else + { + set_internal_node(0,sequence_index); + } + } - } - } - else - { - set_internal_node(0,sequence_index); - } - free(current_recombinations); - free(merged_block_coordinates[0]); - free(merged_block_coordinates[1]); - free(merged_block_coordinates); } -int calculate_number_of_bases_in_recombations_excluding_gaps(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations,int current_total_snps) +int calculate_number_of_bases_in_recombinations(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations, int current_total_snps, int count_gaps) { int total_bases = 0; int current_block = 1; @@ -265,121 +285,91 @@ int calculate_number_of_bases_in_recombations_excluding_gaps(int ** block_coordi } for(start_block = 0; start_block < num_blocks; start_block++) { - if(block_coordinates[0][start_block] == -1 || block_coordinates[1][start_block] == -1) + if(block_coordinates[0][start_block] != -1 && block_coordinates[1][start_block] != -1) { - continue; + if (count_gaps > 0) { + total_bases += (1 + block_coordinates[1][start_block] - block_coordinates[0][start_block]); + } else { + total_bases += calculate_block_size_without_gaps(child_sequence, + snp_locations, + block_coordinates[0][start_block], + block_coordinates[1][start_block], + current_total_snps); + } } - - - total_bases += calculate_block_size_without_gaps(child_sequence, - snp_locations, - block_coordinates[0][start_block], - block_coordinates[1][start_block], - current_total_snps); + } return total_bases; } -void carry_unambiguous_gaps_up_tree(newick_node *root) +void generate_branch_sequences(newick_node *node, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps,FILE * branch_snps_file_pointer, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int thread_index) { - if(root->childNum > 0) - { - newick_child *child; - int parent_sequence_index = find_sequence_index_from_sample_name(root->taxon); - - child = root->child; - int child_sequence_indices[root->childNum]; - int child_counter = 0; - while (child != NULL) - { - child_sequence_indices[child_counter] = find_sequence_index_from_sample_name(child->node->taxon); - carry_unambiguous_gaps_up_tree(child->node); - child = child->next; - child_counter++; - } - - // compare the parent sequence to the each child sequence and update the gaps - fill_in_unambiguous_gaps_in_parent_from_children(parent_sequence_index, child_sequence_indices,child_counter); - //fill_in_unambiguous_bases_in_parent_from_children_where_parent_has_a_gap(parent_sequence_index, child_sequence_indices,child_counter); - } -} - -char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, char * leaf_sequence, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps,FILE * branch_snps_file_pointer, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag) -{ - newick_child *child; + + // Initialise common data structures + newick_child *child; int child_counter = 0; - int current_branch =0; int branch_genome_size = 0; - int number_of_branch_snps=0; - root->current_node_id = ++node_counter; - - if (root->childNum == 0) + int number_of_branch_snps = 0; + + // Get node sequence + int node_sequence_index = find_sequence_index_from_sample_name(node->taxon); + char * node_sequence = (char *) calloc((number_of_snps +1),sizeof(char)); + get_sequence_for_sample_index(node_sequence, node_sequence_index); + + // Get sequence reconstructed at node + branch_genome_size = calculate_size_of_genome_without_gaps(node_sequence, 0,number_of_snps, length_of_original_genome); + set_genome_length_without_gaps_for_sample(node_sequence_index,branch_genome_size); + + if (node->childNum == 0) { - leaf_sequence = (char *) calloc((number_of_snps +1),sizeof(char)); - get_sequence_for_sample_name(leaf_sequence, root->taxon); + + node->taxon_names = (char *) calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char)); + memcpy(node->taxon_names, node->taxon, size_of_string(node->taxon)+1); - root->taxon_names = (char *) calloc(MAX_SAMPLE_NAME_SIZE,sizeof(char)); - memcpy(root->taxon_names, root->taxon, size_of_string(root->taxon)+1); - - // Save some statistics about the sequence - branch_genome_size = calculate_size_of_genome_without_gaps(leaf_sequence, 0,number_of_snps, length_of_original_genome); - set_genome_length_without_gaps_for_sample(root->taxon,branch_genome_size); - - return leaf_sequence; } else { - child = root->child; - char * child_sequences[root->childNum]; - newick_node * child_nodes[root->childNum]; - root->taxon_names = (char *) calloc(MAX_SAMPLE_NAME_SIZE*number_of_columns,sizeof(char)); + + // Get child sequences + child = node->child; + int child_sequence_indices[node->childNum]; + char ** child_sequences = calloc((number_of_snps + 1) * node->childNum, sizeof(char*)); + // Allocate memory for each string in child_sequences + for (int i = 0; i < node->childNum; i++) { + child_sequences[i] = malloc((number_of_snps + 1) * sizeof(char)); + } - // generate pointers for each child seuqn + newick_node * child_nodes[node->childNum]; + node->taxon_names = (char *) calloc(MAX_SAMPLE_NAME_SIZE*number_of_columns,sizeof(char)); + + // generate pointers for each child sequence while (child != NULL) { - // recursion - child_sequences[child_counter] = generate_branch_sequences(child->node, - vcf_file_pointer, - snp_locations, - number_of_snps, - column_names, - number_of_columns, - child_sequences[child_counter], - length_of_original_genome, - block_file_pointer, - gff_file_pointer, - min_snps, - branch_snps_file_pointer, - window_min, - window_max, - uncorrected_p_value, - trimming_ratio, - extensive_search_flag); - child_nodes[child_counter] = child->node; + // Retrieve child sequences from alignment + child_sequence_indices[child_counter] = find_sequence_index_from_sample_name(child->node->taxon); + get_sequence_for_sample_index(child_sequences[child_counter], child_sequence_indices[child_counter]); + child_nodes[child_counter] = child->node; + + // Merge list of descendant taxa char delimiter_string[3] = {" "}; - concat_strings_created_with_malloc(root->taxon_names, delimiter_string); - concat_strings_created_with_malloc(root->taxon_names, child_nodes[child_counter]->taxon_names); - - child_counter++; - child = child->next; - } - - // For all bases update the parent sequence with N if all child sequences. - - leaf_sequence = (char *) calloc((number_of_snps +1),sizeof(char)); - // All child sequneces should be available use them to find the ancestor sequence - get_sequence_for_sample_name(leaf_sequence, root->taxon); - - branch_genome_size = calculate_size_of_genome_without_gaps(leaf_sequence, 0,number_of_snps, length_of_original_genome); - set_genome_length_without_gaps_for_sample(root->taxon,branch_genome_size); - - - - for(current_branch = 0 ; current_branch< (root->childNum); current_branch++) - { + concat_strings_created_with_malloc(node->taxon_names, delimiter_string); + concat_strings_created_with_malloc(node->taxon_names, child_nodes[child_counter]->taxon_names); + + // Move on to next child + child = child->next; + ++child_counter; + } + + // Fill in gaps from children + fill_in_unambiguous_gaps_in_parent_from_children(node_sequence_index, child_sequence_indices,child_counter); + + for (child_counter = 0; child_counter < node->childNum; ++child_counter) + { + + // Identify recombinations on descendant branches int * branches_snp_sites; branches_snp_sites = (int *) calloc((number_of_snps +1),sizeof(int)); char * branch_snp_sequence; @@ -387,27 +377,26 @@ char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * branch_snp_sequence = (char *) calloc((number_of_snps +1),sizeof(char)); branch_snp_ancestor_sequence = (char *) calloc((number_of_snps +1),sizeof(char)); - branch_genome_size = calculate_size_of_genome_without_gaps(child_sequences[current_branch], 0,number_of_snps, length_of_original_genome); - number_of_branch_snps = calculate_number_of_snps_excluding_gaps(leaf_sequence, child_sequences[current_branch], number_of_snps, branches_snp_sites, snp_locations,branch_snp_sequence,branch_snp_ancestor_sequence); + number_of_branch_snps = calculate_number_of_snps_excluding_gaps(node_sequence, child_sequences[child_counter], number_of_snps, branches_snp_sites, snp_locations,branch_snp_sequence,branch_snp_ancestor_sequence); - child_nodes[current_branch]->number_of_snps = number_of_branch_snps; - print_branch_snp_details(branch_snps_file_pointer, child_nodes[current_branch]->taxon,root->taxon, branches_snp_sites, number_of_branch_snps, branch_snp_sequence, branch_snp_ancestor_sequence,child_nodes[current_branch]->taxon_names); + child_nodes[child_counter]->number_of_snps = number_of_branch_snps; + print_branch_snp_details(branch_snps_file_pointer, child_nodes[child_counter]->taxon,node->taxon, branches_snp_sites, number_of_branch_snps, branch_snp_sequence, branch_snp_ancestor_sequence,child_nodes[child_counter]->taxon_names); - get_likelihood_for_windows(child_sequences[current_branch], + get_likelihood_for_windows(child_sequences[child_counter], number_of_snps, branches_snp_sites, branch_genome_size, number_of_branch_snps, snp_locations, - child_nodes[current_branch], + child_nodes[child_counter], block_file_pointer, - root, + node, branch_snp_sequence, gff_file_pointer, min_snps, length_of_original_genome, - leaf_sequence, + node_sequence, window_min, window_max, uncorrected_p_value, @@ -415,13 +404,23 @@ char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * extensive_search_flag); free(branch_snp_sequence); free(branch_snp_ancestor_sequence); - free(child_sequences[current_branch]); free(branches_snp_sites); - + } - - return leaf_sequence; + + // Store node sequence + free(node_sequence); + + // Deallocate memory for each string in child_sequences + for (int i = 0; i < node->childNum; i++) { + free(child_sequences[i]); + } + + // Deallocate memory for child_sequences + free(child_sequences); + } + } @@ -744,6 +743,8 @@ int get_blocks(int ** block_coordinates, int genome_size,int * snp_site_coords,i // create the pileup of the snps and their sphere of influence int snp_counter = 0; + int min_postion = genome_size; + int max_position = 0; for(snp_counter = 0; snp_counter < number_of_branch_snps; snp_counter++) { int j = 0; @@ -760,6 +761,11 @@ int get_blocks(int ** block_coordinates, int genome_size,int * snp_site_coords,i snp_sliding_window_counter = 0; } + if(snp_sliding_window_counter < min_postion) + { + min_postion = snp_sliding_window_counter; + } + // Upper bound of the window around a snp int max_snp_sliding_window_counter = snp_site_coords[snp_counter]+(window_size/2); max_snp_sliding_window_counter = extend_upper_part_of_window(snp_site_coords[snp_counter] + 1, @@ -771,6 +777,11 @@ int get_blocks(int ** block_coordinates, int genome_size,int * snp_site_coords,i { max_snp_sliding_window_counter = genome_size; } + + if(max_snp_sliding_window_counter > max_position) + { + max_position= max_snp_sliding_window_counter; + } for(j = snp_sliding_window_counter; j < max_snp_sliding_window_counter; j++) { @@ -784,7 +795,7 @@ int get_blocks(int ** block_coordinates, int genome_size,int * snp_site_coords,i int block_lower_bound = 0; // Scan across the pileup and record where blocks are above the cutoff int i; - for(i = 0; i <= genome_size; i++) + for(i = min_postion; i <= max_position; i++) { // Just entered the start of a block if(window_count[i] > cutoff && in_block == 0) @@ -794,20 +805,23 @@ int get_blocks(int ** block_coordinates, int genome_size,int * snp_site_coords,i } // Reached end of genome - if(i == genome_size && in_block == 1) + if (in_block == 1) { - block_coordinates[0][number_of_blocks] = block_lower_bound; - block_coordinates[1][number_of_blocks] = i; - number_of_blocks++; - in_block = 0; - } - // Just left a block - else if(window_count[i] <= cutoff && in_block == 1) - { - block_coordinates[0][number_of_blocks] = block_lower_bound; - block_coordinates[1][number_of_blocks] = i-1; - number_of_blocks++; - in_block = 0; + if(i == genome_size) + { + block_coordinates[0][number_of_blocks] = block_lower_bound; + block_coordinates[1][number_of_blocks] = i; + number_of_blocks++; + in_block = 0; + } + // Just left a block + else if(window_count[i] <= cutoff) + { + block_coordinates[0][number_of_blocks] = block_lower_bound; + block_coordinates[1][number_of_blocks] = i-1; + number_of_blocks++; + in_block = 0; + } } } @@ -1170,54 +1184,70 @@ double reduce_factorial(int l, int i) // c = number_of_block_snps double get_block_likelihood(int branch_genome_size, int number_of_branch_snps, int block_genome_size_without_gaps, int number_of_block_snps) { - double part1, part2, part3, part4; - - if(block_genome_size_without_gaps == 0) - { - return 0.0; - } - if(number_of_block_snps == 0) - { - return 0.0; - } - - part1 = log10(number_of_block_snps*1.0/block_genome_size_without_gaps*1.0)*number_of_block_snps; - - if((block_genome_size_without_gaps-number_of_block_snps) == 0) - { - part2 = 0.0; - } - else - { - part2 = log10( ((block_genome_size_without_gaps-number_of_block_snps)*1.0)/block_genome_size_without_gaps*1.0 )*(block_genome_size_without_gaps-number_of_block_snps); - } - - if((number_of_branch_snps-number_of_block_snps) == 0) - { - part3 = 0.0; - } - else - { - part3 = log10(((number_of_branch_snps-number_of_block_snps)*1.0)/((branch_genome_size-block_genome_size_without_gaps)*1.0))*(number_of_branch_snps-number_of_block_snps); - } - - if(((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps))==0) - { - part4 = 0.0; - } - else - { - part4=log10(((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps)*1.0 )/((branch_genome_size-block_genome_size_without_gaps)*1.0)) * ((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps)); - } - - return (part1+part2+part3+part4)*-1; + double part1, part2, part3, part4; + + if(block_genome_size_without_gaps == 0) + { + return 0.0; + } + if(number_of_block_snps == 0) + { + return 0.0; + } + + part1 = log10(number_of_block_snps*1.0/block_genome_size_without_gaps*1.0)*number_of_block_snps; + + if((block_genome_size_without_gaps-number_of_block_snps) == 0) + { + part2 = 0.0; + } + else + { + part2 = log10( ((block_genome_size_without_gaps-number_of_block_snps)*1.0)/block_genome_size_without_gaps*1.0 )*(block_genome_size_without_gaps-number_of_block_snps); + } + + if((number_of_branch_snps-number_of_block_snps) == 0) + { + part3 = 0.0; + } + else + { + part3 = log10(((number_of_branch_snps-number_of_block_snps)*1.0)/((branch_genome_size-block_genome_size_without_gaps)*1.0))*(number_of_branch_snps-number_of_block_snps); + } + + if(((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps))==0) + { + part4 = 0.0; + } + else + { + part4=log10(((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps)*1.0 )/((branch_genome_size-block_genome_size_without_gaps)*1.0)) * ((branch_genome_size-block_genome_size_without_gaps)-(number_of_branch_snps-number_of_block_snps)); + } + + return (part1+part2+part3+part4)*-1; } int calculate_genome_length_excluding_blocks_and_gaps(char * sequence, int length_of_sequence, int ** block_coordinates, int num_blocks) { - int genome_length = length_of_sequence; + // Process list of recombinations + int j = 0; + int * filtered_start_coords; + int * filtered_end_coords; + int num_filtered_blocks = 0; + filtered_start_coords = (int *) calloc(num_blocks,sizeof(int)); + filtered_end_coords = (int *) calloc(num_blocks,sizeof(int)); + for(j = 0; j= filtered_start_coords[j] && i <= filtered_end_coords[j]) { - if (i >= block_coordinates[0][j] && i <= block_coordinates[1][j]) - { - genome_length--; - } + genome_length--; } } } } + free(filtered_start_coords); + free(filtered_end_coords); + return genome_length; } - - diff --git a/src/branch_sequences.h b/src/branch_sequences.h index c62ff6a4..a93e27af 100644 --- a/src/branch_sequences.h +++ b/src/branch_sequences.h @@ -21,7 +21,7 @@ #define _BRANCH_SEQUENCES_H_ #include "seqUtil.h" #include "Newickform.h" -char *generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, char * leaf_sequence, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps, FILE * branch_snps_file_pointer, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag); +void generate_branch_sequences(newick_node *root, FILE *vcf_file_pointer,int * snp_locations, int number_of_snps, char** column_names, int number_of_columns, int length_of_original_genome, FILE * block_file_pointer, FILE * gff_file_pointer,int min_snps, FILE * branch_snps_file_pointer, int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int thread_index); void identify_recombinations(int number_of_branch_snps, int * branches_snp_sites,int length_of_original_genome); double calculate_snp_density(int * branches_snp_sites, int number_of_branch_snps, int index); void get_likelihood_for_windows(char * child_sequence, int current_num_snps, int * snp_site_coords, int branch_genome_size, int number_of_branch_snps, int * snp_locations, newick_node * current_node, FILE * block_file_pointer, newick_node *root, char * branch_snp_sequence, FILE * gff_file_pointer,int min_snps, int length_of_original_genome, char * original_sequence,int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag); @@ -30,7 +30,7 @@ int calculate_window_size(int branch_genome_size, int number_of_branch_snps,int double calculate_threshold(int branch_genome_size, int window_size, float uncorrected_p_value); int p_value_test(int branch_genome_size, int window_size, int num_branch_snps, int block_snp_count, int min_snps, float uncorrected_p_value); double reduce_factorial(int l, int i); -void fill_in_recombinations_with_gaps(newick_node *root, int * parent_recombinations, int parent_num_recombinations, int current_total_snps,int num_blocks, int ** current_block_coordinates,int length_of_original_genome,int * snp_locations, int number_of_snps); +void fill_in_recombinations_with_gaps(newick_node ** nodeArray, int node_index, int parent_node_index, int ** current_recombinations, int * num_recombinations, int * current_total_snps,int * num_blocks, int length_of_original_genome,int * snp_locations, int number_of_snps); int copy_and_concat_integer_arrays(int * array_1, int array_1_size, int * array_2, int array_2_size, int * output_array); int copy_and_concat_2d_integer_arrays(int ** array_1, int array_1_size, int ** array_2, int array_2_size, int ** output_array); double snp_density(int length_of_sequence, int number_of_snps); @@ -38,7 +38,7 @@ int calculate_cutoff(int branch_genome_size, int window_size, int num_branch_snp int get_smallest_log_likelihood(double * candidate_blocks, int number_of_candidate_blocks); int exclude_snp_sites_in_block(int window_start_coordinate, int window_end_coordinate, int * snp_site_coords, int number_of_branch_snps); int flag_smallest_log_likelihood_recombinations(int ** candidate_blocks, int number_of_candidate_blocks, int number_of_branch_snps, int * snp_site_coords, int * recombinations, int number_of_recombinations,newick_node * current_node, FILE * block_file_pointer, newick_node *root,int * snp_locations, int total_num_snps, FILE * gff_file_pointer, double * block_likelihooods); -int calculate_number_of_bases_in_recombations_excluding_gaps(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations,int current_total_snps); +int calculate_number_of_bases_in_recombinations(int ** block_coordinates, int num_blocks,char * child_sequence, int * snp_locations,int current_total_snps, int count_gaps); void carry_unambiguous_gaps_up_tree(newick_node *root); void move_blocks_inwards_while_likelihood_improves(int number_of_blocks,int ** block_coordinates, int min_snps, int * snp_site_coords, int number_of_branch_snps,char * branch_snp_sequence, int * snp_locations, int branch_genome_size,char * child_sequence, int length_of_sequence, double * block_likelihoods, int cutoff_value, float trimming_ratio); int get_blocks(int ** block_coordinates, int branch_genome_size,int * snp_site_coords,int number_of_branch_snps, int window_size, int cutoff, char * original_sequence, int * snp_locations, int number_of_snps); diff --git a/src/csv_of_snp_sites.c b/src/csv_of_snp_sites.c index b694239b..b61fa8d8 100644 --- a/src/csv_of_snp_sites.c +++ b/src/csv_of_snp_sites.c @@ -77,7 +77,6 @@ void create_csv_of_snp_sites(char filename[], int number_of_snps, char ** bases_ // Indices run consecutively, rather than using SNP locations in whole genome alignment // This is because pyjar reconstructs only the polymorphic sites, not the whole sequences indexed_pattern* base_pattern_indices = malloc(number_of_snps * sizeof(indexed_pattern)); - i = 0; for (i = 0; i < number_of_snps; i++) { base_pattern_indices[i].pattern = bases_for_snps[i]; diff --git a/src/gubbins.c b/src/gubbins.c index cdc7cdea..4d29ea14 100644 --- a/src/gubbins.c +++ b/src/gubbins.c @@ -39,16 +39,16 @@ // given a sample name extract the sequences from the vcf // compare two sequences to get pseudo sequnece and fill in with difference from reference sequence -void run_gubbins(char vcf_filename[], char tree_filename[],char multi_fasta_filename[], int min_snps, char original_multi_fasta_filename[], int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag) +void run_gubbins(char vcf_filename[], char tree_filename[],char multi_fasta_filename[], int min_snps, char original_multi_fasta_filename[], int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int num_threads) { load_sequences_from_multifasta_file(multi_fasta_filename); - extract_sequences(vcf_filename, tree_filename, multi_fasta_filename,min_snps,original_multi_fasta_filename,window_min, window_max, uncorrected_p_value, trimming_ratio, extensive_search_flag); + extract_sequences(vcf_filename, tree_filename, multi_fasta_filename,min_snps,original_multi_fasta_filename,window_min, window_max, uncorrected_p_value, trimming_ratio, extensive_search_flag, num_threads); create_tree_statistics_file(tree_filename,get_sample_statistics(),number_of_samples_from_parse_phylip()); freeup_memory(); } -void extract_sequences(char vcf_filename[], char tree_filename[],char multi_fasta_filename[],int min_snps, char original_multi_fasta_filename[], int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag) +void extract_sequences(char vcf_filename[], char tree_filename[],char multi_fasta_filename[],int min_snps, char original_multi_fasta_filename[], int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int num_threads) { FILE *vcf_file_pointer; vcf_file_pointer=fopen(vcf_filename, "r"); @@ -74,7 +74,7 @@ void extract_sequences(char vcf_filename[], char tree_filename[],char multi_fast get_integers_from_column_in_vcf(vcf_file_pointer, snp_locations, number_of_snps, column_number_for_column_name(column_names, "POS", number_of_columns)); - root_node = build_newick_tree(tree_filename, vcf_file_pointer,snp_locations, number_of_snps, column_names, number_of_columns, length_of_original_genome,min_snps,window_min, window_max, uncorrected_p_value, trimming_ratio, extensive_search_flag); + root_node = build_newick_tree(tree_filename, vcf_file_pointer,snp_locations, number_of_snps, column_names, number_of_columns, length_of_original_genome,min_snps,window_min, window_max, uncorrected_p_value, trimming_ratio, extensive_search_flag, num_threads); fclose(vcf_file_pointer); int* filtered_snp_locations = calloc((number_of_snps+1), sizeof(int)); diff --git a/src/gubbins.h b/src/gubbins.h index 10c00d95..94c3dc5a 100644 --- a/src/gubbins.h +++ b/src/gubbins.h @@ -23,8 +23,8 @@ #include "seqUtil.h" #include "Newickform.h" -void run_gubbins(char vcf_filename[], char tree_filename[], char multi_fasta_filename[], int min_snps, char original_multi_fasta_filename[], int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag); -void extract_sequences(char vcf_filename[], char tree_filename[],char multi_fasta_filename[], int min_snps, char original_multi_fasta_filename[], int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag); +void run_gubbins(char vcf_filename[], char tree_filename[], char multi_fasta_filename[], int min_snps, char original_multi_fasta_filename[], int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int num_threads); +void extract_sequences(char vcf_filename[], char tree_filename[],char multi_fasta_filename[], int min_snps, char original_multi_fasta_filename[], int window_min, int window_max, float uncorrected_p_value, float trimming_ratio, int extensive_search_flag, int n_threads); char find_first_real_base(int base_position, int number_of_child_sequences, char ** child_sequences); diff --git a/src/main.c b/src/main.c index 8d6ca4af..421b1a25 100644 --- a/src/main.c +++ b/src/main.c @@ -32,7 +32,7 @@ const char* program_name; // Assumptions: // The sequences in the multi fasta alignment file are the same length -// Your only interested in SNPs, INDELS are ignored +// You are only interested in SNPs, INDELS are ignored // The first sequence is chosen as the reference sequence // If there is an indel in the reference sequence, the first normal base found in another strain is used. @@ -46,9 +46,10 @@ void print_usage(FILE* stream, int exit_code) " -t Newick tree file\n" " -v VCF file\n" " -f Original Multifasta file\n" + " -n Number of threads to use for recombination detection\n" " -m Min SNPs for identifying a recombination block\n" - " -a Min window size\n" - " -b Max window size\n" + " -a Min window size\n" + " -b Max window size\n" " -p p value for detecting recombinations\n" " -i p value ratio for trimming recombinations\n" " -h Display this usage information.\n\n" @@ -86,6 +87,7 @@ int main (int argc, char ** argv) float uncorrected_p_value = 0.05; float trimming_ratio = 1.0; int extensive_search_flag = 0; + int num_threads = 1; program_name = argv[0]; while (1) @@ -103,12 +105,13 @@ int main (int argc, char ** argv) {"p_value", required_argument, 0, 'p'}, {"trimming_ratio", required_argument, 0, 'i'}, {"extended_search", required_argument, 0, 'x'}, + {"ncpu", required_argument, 0, 'n'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long (argc, argv, "hrxv:f:t:m:a:b:p:i:", + c = getopt_long (argc, argv, "hrxv:f:t:m:a:b:p:i:n:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) @@ -157,6 +160,9 @@ int main (int argc, char ** argv) case 't': memcpy(tree_filename, optarg, size_of_string(optarg) +1); break; + case 'n': + num_threads = atoi(optarg); + break; case '?': /* getopt_long already printed an error message. */ break; @@ -188,7 +194,8 @@ int main (int argc, char ** argv) window_max, uncorrected_p_value, trimming_ratio, - extensive_search_flag); + extensive_search_flag, + num_threads); } else { diff --git a/src/parse_phylip.c b/src/parse_phylip.c index 24bb2aa4..651f3960 100644 --- a/src/parse_phylip.c +++ b/src/parse_phylip.c @@ -62,17 +62,21 @@ int get_internal_node(int sequence_index) void get_sequence_for_sample_name(char * sequence_bases, char * sample_name) { int sequence_index; - sequence_index = find_sequence_index_from_sample_name( sample_name); + sequence_index = find_sequence_index_from_sample_name(sample_name); if(sequence_index < 0) { - printf("Couldnt find sequence name %s with index %d\n", sample_name,sequence_index); + printf("Could not find sequence name %s with index %d\n", sample_name,sequence_index); exit(1); } + get_sequence_for_sample_index(sequence_bases, sequence_index); + +} +void get_sequence_for_sample_index(char * sequence_bases, int sequence_index) +{ memcpy(sequence_bases, sequences[sequence_index], size_of_string(sequences[sequence_index]) +1); } - void fill_in_unambiguous_gaps_in_parent_from_children(int parent_sequence_index, int * child_sequence_indices, int num_children) { int snp_counter = 0; @@ -237,10 +241,8 @@ void filter_sequence_bases_and_rotate(char * reference_bases, char ** filtered_b } -void set_number_of_recombinations_for_sample(char * sample_name, int number_of_recombinations) +void set_number_of_recombinations_for_sample(int sample_index, int number_of_recombinations) { - int sample_index ; - sample_index = find_sequence_index_from_sample_name( sample_name); if( sample_index == -1) { return; @@ -249,15 +251,12 @@ void set_number_of_recombinations_for_sample(char * sample_name, int number_of_r } -void set_number_of_snps_for_sample(char * sample_name, int number_of_snps) +void set_number_of_snps_for_sample(int sample_index, int number_of_snps) { - int sample_index ; - sample_index = find_sequence_index_from_sample_name( sample_name); if( sample_index == -1) { return; } - ((sample_statistics *) statistics_for_samples[sample_index])->number_of_snps = number_of_snps; } @@ -275,10 +274,8 @@ void set_number_of_blocks_for_sample(char * sample_name,int num_blocks) -void set_genome_length_without_gaps_for_sample(char * sample_name, int genome_length_without_gaps) +void set_genome_length_without_gaps_for_sample(int sample_index, int genome_length_without_gaps) { - int sample_index ; - sample_index = find_sequence_index_from_sample_name( sample_name); if( sample_index == -1) { return; @@ -287,11 +284,10 @@ void set_genome_length_without_gaps_for_sample(char * sample_name, int genome_le ((sample_statistics *) statistics_for_samples[sample_index])->genome_length_without_gaps = genome_length_without_gaps; } -void set_genome_length_excluding_blocks_and_gaps_for_sample(char * sample_name, int genome_length_excluding_blocks_and_gaps) +void set_genome_length_excluding_blocks_and_gaps_for_sample(int sample_index, int genome_length_excluding_blocks_and_gaps) { - int sample_index ; - sample_index = find_sequence_index_from_sample_name( sample_name); - if( sample_index == -1) + + if( sample_index == -1) { return; } @@ -299,26 +295,34 @@ void set_genome_length_excluding_blocks_and_gaps_for_sample(char * sample_name, ((sample_statistics *) statistics_for_samples[sample_index])->genome_length_excluding_blocks_and_gaps = genome_length_excluding_blocks_and_gaps; } -void set_number_of_branch_bases_in_recombinations(char * sample_name, int bases_in_recombinations) +void set_number_of_branch_bases_in_recombinations(int sample_index, int bases_in_recombinations, int include_gaps) { - int sample_index ; - sample_index = find_sequence_index_from_sample_name( sample_name); + if( sample_index == -1) { return; } - ((sample_statistics *) statistics_for_samples[sample_index])->branch_bases_in_recombinations = bases_in_recombinations; + + if (include_gaps > 0) { + ((sample_statistics *) statistics_for_samples[sample_index])->branch_bases_in_recombinations_including_gaps = bases_in_recombinations; + } else { + ((sample_statistics *) statistics_for_samples[sample_index])->branch_bases_in_recombinations = bases_in_recombinations; + } } -void set_number_of_bases_in_recombinations(char * sample_name, int bases_in_recombinations) +void set_number_of_bases_in_recombinations(int sample_index, int bases_in_recombinations, int include_gaps) { - int sample_index ; - sample_index = find_sequence_index_from_sample_name( sample_name); - if( sample_index == -1) - { - return; - } + if( sample_index == -1) + { + return; + } + + if (include_gaps > 0) { + ((sample_statistics *) statistics_for_samples[sample_index])->bases_in_recombinations_including_gaps = bases_in_recombinations; + } else { ((sample_statistics *) statistics_for_samples[sample_index])->bases_in_recombinations = bases_in_recombinations; + } + } @@ -387,7 +391,6 @@ void load_sequences_from_multifasta_file(char filename[]) get_sample_names_for_header(filename, phylip_sample_names, num_samples); int l; - i = 0; int sequence_number = 0; gzFile fp; diff --git a/src/parse_phylip.h b/src/parse_phylip.h index 9be4c939..f1d489a0 100644 --- a/src/parse_phylip.h +++ b/src/parse_phylip.h @@ -27,21 +27,24 @@ int number_of_snps; int genome_length_without_gaps; int number_of_blocks; + int bases_in_recombinations_including_gaps; + int branch_bases_in_recombinations_including_gaps; int bases_in_recombinations; int branch_bases_in_recombinations; int genome_length_excluding_blocks_and_gaps; } sample_statistics; void get_sequence_for_sample_name(char * sequence_bases, char * sample_name); +void get_sequence_for_sample_index(char * sequence_bases, int sequence_index); int find_sequence_index_from_sample_name( char * sample_name); int update_sequence_base(char new_sequence_base, int sequence_index, int base_index); int does_column_contain_snps(int snp_column, char reference_base); int number_of_samples_from_parse_phylip(void); void get_sample_names_from_parse_phylip(char ** sample_names); char convert_reference_to_real_base_in_column(int snp_column, char reference_base); -void set_genome_length_without_gaps_for_sample(char * sample_name, int genome_length_without_gaps); -void set_number_of_snps_for_sample(char * sample_name, int number_of_snps); -void set_number_of_recombinations_for_sample(char * sample_name, int number_of_recombinations); +void set_genome_length_without_gaps_for_sample(int sample_index, int genome_length_without_gaps); +void set_number_of_snps_for_sample(int sample_index, int number_of_snps); +void set_number_of_recombinations_for_sample(int sample_index, int number_of_recombinations); void set_number_of_blocks_for_sample(char * sample_name,int num_blocks); sample_statistics ** get_sample_statistics(void); int number_of_snps_in_phylip(void); @@ -53,10 +56,10 @@ int get_internal_node(int sequence_index); void fill_in_unambiguous_bases_in_parent_from_children_where_parent_has_a_gap(int parent_sequence_index, int * child_sequence_indices, int num_children); void fill_in_unambiguous_gaps_in_parent_from_children(int parent_sequence_index, int * child_sequence_indices, int num_children); void freeup_memory(void); -void set_number_of_branch_bases_in_recombinations(char * sample_name, int bases_in_recombinations); -void set_number_of_bases_in_recombinations(char * sample_name, int bases_in_recombinations); +void set_number_of_branch_bases_in_recombinations(int sample_index, int bases_in_recombinations, int include_gaps); +void set_number_of_bases_in_recombinations(int sample_index, int bases_in_recombinations, int include_gaps); void filter_sequence_bases_and_rotate(char * reference_bases, char ** filtered_bases_for_snps, int number_of_filtered_snps); -void set_genome_length_excluding_blocks_and_gaps_for_sample(char * sample_name, int genome_length_excluding_blocks_and_gaps); +void set_genome_length_excluding_blocks_and_gaps_for_sample(int sample_index, int genome_length_excluding_blocks_and_gaps); #define MAX_READ_BUFFER 65536 #define MAX_SAMPLE_NAME_SIZE 1024 diff --git a/src/snp_searching.c b/src/snp_searching.c index 0af3f3ee..ece5e192 100644 --- a/src/snp_searching.c +++ b/src/snp_searching.c @@ -55,22 +55,22 @@ int advance_window_start_to_next_snp(int window_start_coordinate, int * snp_loca int find_starting_index(int window_start_coordinate, int * snp_locations, int start_index, int end_index) { int current_index = 0; - if(start_index == end_index || start_index + 1 == end_index) { - return start_index; + return start_index; } else if(end_index < start_index) { - return end_index; + return end_index; } if( snp_locations[start_index]>= window_start_coordinate) { - return start_index; + return start_index; } current_index = (int)((end_index-start_index)/2) + start_index; + if(current_index >= end_index) { current_index = end_index-1; @@ -79,7 +79,7 @@ int find_starting_index(int window_start_coordinate, int * snp_locations, int st { current_index = start_index+1; } - + if( snp_locations[current_index] < window_start_coordinate) { start_index = current_index; @@ -92,7 +92,6 @@ int find_starting_index(int window_start_coordinate, int * snp_locations, int st { return current_index; } - return find_starting_index(window_start_coordinate, snp_locations, start_index, end_index); } @@ -212,7 +211,7 @@ int calculate_block_size_without_gaps(char * child_sequence, int * snp_locations int calculate_block_size_without_gaps_with_start_end_index(char * child_sequence, int * snp_locations, int starting_coordinate, int ending_coordinate, int current_snp_count, int start_index,int end_index) { int i; - int block_size = ending_coordinate - starting_coordinate; + int block_size = ending_coordinate - starting_coordinate + 1; // start and end coordinate are inclusive as they mark the bounding SNPs start_index = find_starting_index( starting_coordinate, snp_locations,start_index, end_index); for(i = start_index; i < current_snp_count ; i++) diff --git a/src/tree_statistics.c b/src/tree_statistics.c index bf331e33..792b8018 100644 --- a/src/tree_statistics.c +++ b/src/tree_statistics.c @@ -35,25 +35,27 @@ void create_tree_statistics_file(char filename[], sample_statistics ** statistic char extension[7] = {".stats"}; concat_strings_created_with_malloc(base_filename,extension); file_pointer = fopen(base_filename, "w"); - fprintf( file_pointer, "Node\tTotal SNPs\tNumber of SNPs Inside Recombinations\tNumber of SNPs Outside Recombinations\tNumber of Recombination Blocks\tBases in Recombinations\tCumulative Bases in Recombinations\tr/m\trho/theta\tGenome Length\tBases in Clonal Frame\n"); + fprintf( file_pointer, "Node\tTotal SNPs\tNumber of SNPs Inside Recombinations\tNumber of SNPs Outside Recombinations\tNumber of Recombination Blocks\tBases in Recombinations Including Gaps\tCumulative Bases in Recombinations Including Gaps\tBases in Recombinations Excluding Gaps\tCumulative Bases in Recombinations Excluding Gaps\tr/m\trho/theta\tGenome Length\tBases in Clonal Frame\n"); for(sample_counter=0; sample_counter< number_of_samples; sample_counter++) { sample_statistics * sample_details = ((sample_statistics *) statistics_for_samples[sample_counter]); fprintf( file_pointer, "%s\t", sample_details->sample_name); - fprintf( file_pointer, "%i\t", (sample_details->number_of_recombinations + sample_details->number_of_snps)); - fprintf( file_pointer, "%i\t", sample_details->number_of_recombinations); - fprintf( file_pointer, "%i\t", (sample_details->number_of_snps)); - fprintf( file_pointer, "%i\t", sample_details->number_of_blocks); - fprintf( file_pointer, "%i\t", sample_details->branch_bases_in_recombinations); - fprintf( file_pointer, "%i\t", sample_details->bases_in_recombinations); - fprintf( file_pointer, "%f\t", recombination_to_mutation_ratio(sample_details->number_of_recombinations, (sample_details->number_of_snps))); + fprintf( file_pointer, "%i\t", (sample_details->number_of_recombinations + sample_details->number_of_snps)); + fprintf( file_pointer, "%i\t", sample_details->number_of_recombinations); + fprintf( file_pointer, "%i\t", (sample_details->number_of_snps)); + fprintf( file_pointer, "%i\t", sample_details->number_of_blocks); + fprintf( file_pointer, "%i\t", sample_details->branch_bases_in_recombinations_including_gaps); + fprintf( file_pointer, "%i\t", sample_details->bases_in_recombinations_including_gaps); + fprintf( file_pointer, "%i\t", sample_details->branch_bases_in_recombinations); + fprintf( file_pointer, "%i\t", sample_details->bases_in_recombinations); + fprintf( file_pointer, "%f\t", recombination_to_mutation_ratio(sample_details->number_of_recombinations, (sample_details->number_of_snps))); fprintf( file_pointer, "%f\t", rho_theta(sample_details->number_of_blocks,sample_details->number_of_snps)); - fprintf( file_pointer, "%i\t", sample_details->genome_length_without_gaps); + fprintf( file_pointer, "%i\t", sample_details->genome_length_without_gaps); fprintf( file_pointer, "%i", sample_details->genome_length_excluding_blocks_and_gaps); fprintf( file_pointer, "\n"); - free(sample_details->sample_name); + free(sample_details->sample_name); free(sample_details); } diff --git a/tests/check_branch_sequences.c b/tests/check_branch_sequences.c index 52d75c22..8f712646 100644 --- a/tests/check_branch_sequences.c +++ b/tests/check_branch_sequences.c @@ -92,12 +92,12 @@ int test_bases_in_recombinations(int block_size) block_coords[1][3] = 15; char * child_sequence = "AAAAAAAAATAA"; int snp_locations[12] = {1,2,3,5,7,10,11,15,20,30,100,110}; - return calculate_number_of_bases_in_recombations_excluding_gaps(block_coords, block_size, child_sequence, snp_locations,12); + return calculate_number_of_bases_in_recombinations(block_coords, block_size, child_sequence, snp_locations,12,0); } int test_bases_in_recombinations_with_gaps(int block_size) { - int ** block_coords; + int ** block_coords; block_coords = (int **) malloc(2*sizeof(int*)); block_coords[0] = (int*) malloc((4)*sizeof(int )); block_coords[1] = (int*) malloc((4)*sizeof(int )); @@ -111,21 +111,43 @@ int test_bases_in_recombinations_with_gaps(int block_size) block_coords[1][3] = 15; char * child_sequence = "--A---AAAAAAAAAAAAAT"; int snp_locations[16] = {1,4,5,6,7,8,9,10,11,15,20,30,40,50,100,110}; - return calculate_number_of_bases_in_recombations_excluding_gaps(block_coords, block_size, child_sequence, snp_locations,16); + return calculate_number_of_bases_in_recombinations(block_coords, block_size, child_sequence, snp_locations,16,0); } +int test_bases_in_recombinations_including_gaps(int block_size) +{ + int ** block_coords; + block_coords = (int **) malloc(2*sizeof(int*)); + block_coords[0] = (int*) malloc((4)*sizeof(int )); + block_coords[1] = (int*) malloc((4)*sizeof(int )); + block_coords[0][0] = 5; + block_coords[1][0] = 10; + block_coords[0][1] = 100; + block_coords[1][1] = 110; + block_coords[0][2] = 15; + block_coords[1][2] = 20; + block_coords[0][3] = 7; + block_coords[1][3] = 15; + char * child_sequence = "--A---AAAAAAAAAAAAAT"; + int snp_locations[16] = {1,4,5,6,7,8,9,10,11,15,20,30,40,50,100,110}; + return calculate_number_of_bases_in_recombinations(block_coords, block_size, child_sequence, snp_locations,16,1); +} START_TEST (check_calculate_number_of_bases_in_recombations) { - ck_assert(test_bases_in_recombinations(4) == 25); - ck_assert(test_bases_in_recombinations(3) == 20); - ck_assert(test_bases_in_recombinations(2) == 15); - ck_assert(test_bases_in_recombinations(1) == 5); - - ck_assert(test_bases_in_recombinations_with_gaps(4) == 22); - ck_assert(test_bases_in_recombinations_with_gaps(3) == 17); - ck_assert(test_bases_in_recombinations_with_gaps(2) == 12); - ck_assert(test_bases_in_recombinations_with_gaps(1) == 2); + ck_assert(test_bases_in_recombinations(4) == 27); + ck_assert(test_bases_in_recombinations(3) == 23); + ck_assert(test_bases_in_recombinations(2) == 17); + ck_assert(test_bases_in_recombinations(1) == 6); + + ck_assert(test_bases_in_recombinations_with_gaps(4) == 24); + ck_assert(test_bases_in_recombinations_with_gaps(3) == 20); + ck_assert(test_bases_in_recombinations_with_gaps(2) == 14); + ck_assert(test_bases_in_recombinations_with_gaps(1) == 3); + + ck_assert(test_bases_in_recombinations_with_gaps(4) < test_bases_in_recombinations_including_gaps(4)); + ck_assert(test_bases_in_recombinations(4) == test_bases_in_recombinations_including_gaps(4)); + } END_TEST diff --git a/tests/check_gubbins.c b/tests/check_gubbins.c index 931745c7..3ca2fb7a 100644 --- a/tests/check_gubbins.c +++ b/tests/check_gubbins.c @@ -10,7 +10,7 @@ START_TEST (check_gubbins_no_recombinations) { remove("../tests/data/no_recombinations.tre"); cp("../tests/data/no_recombinations.tre", "../tests/data/no_recombinations.original.tre"); - run_gubbins("../tests/data/no_recombinations.aln.vcf", "../tests/data/no_recombinations.tre","../tests/data/no_recombinations.aln.snp_sites.aln",3,"../tests/data/no_recombinations.aln.snp_sites.aln",100,10000,0.05,1,0); + run_gubbins("../tests/data/no_recombinations.aln.vcf", "../tests/data/no_recombinations.tre","../tests/data/no_recombinations.aln.snp_sites.aln",3,"../tests/data/no_recombinations.aln.snp_sites.aln",100,10000,0.05,1,0,1); ck_assert(file_exists("../tests/data/no_recombinations.tre.tab") == 1); ck_assert(file_exists("../tests/data/no_recombinations.tre.vcf") == 1); ck_assert(file_exists("../tests/data/no_recombinations.tre.phylip") == 1); @@ -37,7 +37,7 @@ START_TEST (check_gubbins_one_recombination) { remove("../tests/data/one_recombination.tre"); cp("../tests/data/one_recombination.tre", "../tests/data/one_recombination.original.tre"); - run_gubbins("../tests/data/one_recombination.aln.vcf", "../tests/data/one_recombination.tre","../tests/data/one_recombination.aln.snp_sites.aln",3,"../tests/data/one_recombination.aln.snp_sites.aln",100,10000,0.05,1,0); + run_gubbins("../tests/data/one_recombination.aln.vcf", "../tests/data/one_recombination.tre","../tests/data/one_recombination.aln.snp_sites.aln",3,"../tests/data/one_recombination.aln.snp_sites.aln",100,10000,0.05,1,0,1); ck_assert(file_exists("../tests/data/one_recombination.tre.tab") == 1); ck_assert(file_exists("../tests/data/one_recombination.tre.vcf") == 1); ck_assert(file_exists("../tests/data/one_recombination.tre.phylip") == 1); @@ -73,7 +73,7 @@ START_TEST (check_gubbins_multiple_recombinations) "../tests/data/multiple_recombinations.jar.aln", 3, "../tests/data/multiple_recombinations.aln", - 30,100,0.05,1,0); + 30,100,0.05,1,0,1); ck_assert(file_exists("../tests/data/multiple_recombinations.tre.tab") == 1); ck_assert(file_exists("../tests/data/multiple_recombinations.tre.vcf") == 1); @@ -83,8 +83,9 @@ START_TEST (check_gubbins_multiple_recombinations) ck_assert(file_exists("../tests/data/multiple_recombinations.tre.snp_sites.aln") == 1); ck_assert(number_of_recombinations_in_file("../tests/data/multiple_recombinations.tre.tab") == 4); - ck_assert(compare_files("../tests/data/multiple_recombinations.tre","../tests/data/multiple_recombinations.expected.tre") == 1); - ck_assert(compare_files("../tests/data/multiple_recombinations.tre.branch_snps.tab","../tests/data/multiple_recombinations.tre.branch_snps.expected.tab") == 1); + ck_assert(compare_files("../tests/data/multiple_recombinations.tre","../tests/data/multiple_recombinations.expected.tre") == 1); + ck_assert(compare_files("../tests/data/multiple_recombinations.tre.branch_snps.tab","../tests/data/multiple_recombinations.tre.branch_snps.expected.tab") == 1); + ck_assert(compare_files("../tests/data/multiple_recombinations.tre.stats","../tests/data/multiple_recombinations.tre.expected.stats") == 1); remove("../tests/data/multiple_recombinations.tre"); remove("../tests/data/multiple_recombinations.tre.tab"); @@ -97,6 +98,40 @@ START_TEST (check_gubbins_multiple_recombinations) } END_TEST +START_TEST (check_gubbins_multithreaded) +{ + remove("../tests/data/multiple_recombinations.tre"); + cp("../tests/data/multiple_recombinations.tre", "../tests/data/multiple_recombinations.original.tre"); + + run_gubbins("../tests/data/multiple_recombinations.aln.vcf", + "../tests/data/multiple_recombinations.tre", + "../tests/data/multiple_recombinations.jar.aln", + 3, + "../tests/data/multiple_recombinations.aln", + 30,100,0.05,1,0,2); + + ck_assert(file_exists("../tests/data/multiple_recombinations.tre.tab") == 1); + ck_assert(file_exists("../tests/data/multiple_recombinations.tre.vcf") == 1); + ck_assert(file_exists("../tests/data/multiple_recombinations.tre.phylip") == 1); + ck_assert(file_exists("../tests/data/multiple_recombinations.tre.stats") == 1); + ck_assert(file_exists("../tests/data/multiple_recombinations.tre.gff") == 1); + ck_assert(file_exists("../tests/data/multiple_recombinations.tre.snp_sites.aln") == 1); + + ck_assert(number_of_recombinations_in_file("../tests/data/multiple_recombinations.tre.tab") == 4); + ck_assert(compare_files("../tests/data/multiple_recombinations.tre","../tests/data/multiple_recombinations.expected.tre") == 1); + ck_assert(compare_files("../tests/data/multiple_recombinations.tre.branch_snps.tab","../tests/data/multiple_recombinations.tre.branch_snps.expected.tab") == 1); + + remove("../tests/data/multiple_recombinations.tre"); + remove("../tests/data/multiple_recombinations.tre.tab"); + remove("../tests/data/multiple_recombinations.tre.vcf"); + remove("../tests/data/multiple_recombinations.tre.phylip"); + remove("../tests/data/multiple_recombinations.tre.stats"); + remove("../tests/data/multiple_recombinations.tre.gff"); + remove("../tests/data/multiple_recombinations.tre.snp_sites.aln"); + remove("../tests/data/multiple_recombinations.tre.branch_snps.tab"); +} +END_TEST + START_TEST (check_recombination_at_root) { remove("../tests/data/recombination_at_root/RAxML_result.recombination_at_root.iteration_1"); @@ -108,7 +143,7 @@ START_TEST (check_recombination_at_root) "../tests/data/recombination_at_root/recombination_at_root.aln.gaps.snp_sites.aln", 3, "../tests/data/recombination_at_root/recombination_at_root.aln", - 100,10000,0.05,1,0); + 100,10000,0.05,1,0,1); ck_assert(compare_files("../tests/data/recombination_at_root/RAxML_result.recombination_at_root.iteration_1.tab","../tests/data/recombination_at_root/expected_RAxML_result.recombination_at_root.iteration_1.tab") == 1); diff --git a/tests/data/multiple_recombinations.expected.tre b/tests/data/multiple_recombinations.expected.tre index 1d7f9395..21338c7a 100644 --- a/tests/data/multiple_recombinations.expected.tre +++ b/tests/data/multiple_recombinations.expected.tre @@ -1 +1 @@ -(sequence_10:8.688181,(((((sequence_7:0.000000,sequence_8:0.000000)Node_1:1.408467,sequence_9:0.273120)Node_2:1.278621,sequence_6:0.441660)Node_3:0.709665,sequence_5:0.000000)Node_4:0.810390,(sequence_1:0.266832,(sequence_3:0.000000,(sequence_2:0.000000,sequence_4:0.000000)Node_5:0.000000)Node_6:1.063758)Node_7:0.941001)Node_8:4.481039)Node_9:0.000000; \ No newline at end of file +(sequence_10:8.858538,(((((sequence_7:0.000000,sequence_8:0.000000)Node_1:1.436084,sequence_9:0.278476)Node_2:1.303692,sequence_6:0.450320)Node_3:0.723580,sequence_5:0.000000)Node_4:0.826280,(sequence_1:0.272064,(sequence_3:0.000000,(sequence_2:0.000000,sequence_4:0.000000)Node_5:0.000000)Node_6:1.084616)Node_7:0.959452)Node_8:4.568902)Node_9:0.000000; \ No newline at end of file diff --git a/tests/data/multiple_recombinations.tre.branch_snps.expected.tab b/tests/data/multiple_recombinations.tre.branch_snps.expected.tab index 26f61f5e..eb4e3b9e 100644 --- a/tests/data/multiple_recombinations.tre.branch_snps.expected.tab +++ b/tests/data/multiple_recombinations.tre.branch_snps.expected.tab @@ -1,3 +1,15 @@ +FT variation 35 +FT /node="Node_5->sequence_2" +FT /colour="4" +FT /taxa="sequence_2" +FT /parent_base="T" +FT /replace="A" +FT variation 41 +FT /node="Node_5->sequence_4" +FT /colour="4" +FT /taxa="sequence_4" +FT /parent_base="A" +FT /replace="G" FT variation 28 FT /node="Node_2->sequence_9" FT /colour="4" @@ -136,394 +148,382 @@ FT /colour="4" FT /taxa=" sequence_7 sequence_8 sequence_9" FT /parent_base="A" FT /replace="G" -FT variation 28 -FT /node="Node_4->Node_3" +FT variation 22 +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa=" sequence_7 sequence_8 sequence_9 sequence_6" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" +FT variation 35 +FT /node="Node_7->Node_6" +FT /colour="4" +FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /parent_base="A" +FT /replace="T" FT variation 51 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 53 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 54 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 55 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 56 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 57 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 58 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 59 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 60 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 61 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 62 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 63 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 64 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 65 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 66 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 67 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 69 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 70 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 71 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 73 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 74 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 75 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 76 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 77 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 78 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 79 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 80 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 81 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 82 -FT /node="Node_4->sequence_5" +FT /node="Node_7->Node_6" FT /colour="4" -FT /taxa="sequence_5" +FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" FT variation 84 -FT /node="Node_4->sequence_5" -FT /colour="4" -FT /taxa="sequence_5" -FT /parent_base="A" -FT /replace="C" -FT variation 35 -FT /node="Node_5->sequence_2" -FT /colour="4" -FT /taxa="sequence_2" -FT /parent_base="T" -FT /replace="A" -FT variation 41 -FT /node="Node_5->sequence_4" -FT /colour="4" -FT /taxa="sequence_4" -FT /parent_base="A" -FT /replace="G" -FT variation 22 FT /node="Node_7->Node_6" FT /colour="4" FT /taxa=" sequence_3 sequence_2 sequence_4" FT /parent_base="A" FT /replace="C" -FT variation 35 -FT /node="Node_7->Node_6" +FT variation 28 +FT /node="Node_4->Node_3" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa=" sequence_7 sequence_8 sequence_9 sequence_6" FT /parent_base="A" -FT /replace="T" +FT /replace="C" FT variation 51 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 53 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 54 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 55 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 56 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 57 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 58 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 59 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 60 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 61 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 62 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 63 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 64 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 65 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 66 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 67 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 69 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 70 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 71 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 73 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 74 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 75 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 76 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 77 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 78 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 79 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 80 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 81 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 82 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 84 -FT /node="Node_7->Node_6" +FT /node="Node_4->sequence_5" FT /colour="4" -FT /taxa=" sequence_3 sequence_2 sequence_4" +FT /taxa="sequence_5" FT /parent_base="A" FT /replace="C" FT variation 16 diff --git a/tests/data/multiple_recombinations.tre.expected.stats b/tests/data/multiple_recombinations.tre.expected.stats new file mode 100644 index 00000000..156a281c --- /dev/null +++ b/tests/data/multiple_recombinations.tre.expected.stats @@ -0,0 +1,20 @@ +Node Total SNPs Number of SNPs Inside Recombinations Number of SNPs Outside Recombinations Number of Recombination Blocks Bases in Recombinations Including Gaps Cumulative Bases in Recombinations Including Gaps Bases in Recombinations Excluding Gaps Cumulative Bases in Recombinations Excluding Gaps r/m rho/theta Genome Length Bases in Clonal Frame +sequence_1 0 0 0 0 0 132 0 132 0.000000 0.000000 242 110 +sequence_2 1 0 1 0 0 166 0 166 0.000000 0.000000 242 76 +sequence_3 0 0 0 0 0 166 0 166 0.000000 0.000000 242 76 +sequence_4 1 0 1 0 0 166 0 166 0.000000 0.000000 242 76 +sequence_5 30 30 0 1 34 166 34 166 0.000000 0.000000 242 110 +sequence_6 0 0 0 0 0 132 0 132 0.000000 0.000000 242 110 +sequence_7 0 0 0 0 0 153 0 153 0.000000 0.000000 242 89 +sequence_8 0 0 0 0 0 153 0 153 0.000000 0.000000 242 89 +sequence_9 1 0 1 0 0 153 0 153 0.000000 0.000000 227 74 +sequence_10 0 0 0 0 0 0 0 0 0.000000 0.000000 227 227 +Node_1 0 0 0 0 0 153 0 153 0.000000 0.000000 242 89 +Node_2 22 21 1 1 21 153 21 153 21.000000 1.000000 242 110 +Node_3 1 0 1 0 0 132 0 132 0.000000 0.000000 242 110 +Node_4 0 0 0 0 0 132 0 132 0.000000 0.000000 242 110 +Node_5 0 0 0 0 0 166 0 166 0.000000 0.000000 242 76 +Node_6 32 30 2 1 34 166 34 166 15.000000 0.500000 242 110 +Node_7 1 0 1 0 0 132 0 132 0.000000 0.000000 242 110 +Node_8 180 132 48 1 132 132 132 132 2.750000 0.020833 242 242 +Node_9 0 0 0 0 0 0 0 0 0.000000 0.000000 242 242 diff --git a/tests/data/no_recombinations.tre.branch_snps.expected.tab b/tests/data/no_recombinations.tre.branch_snps.expected.tab index 985bc620..8c768cd4 100644 --- a/tests/data/no_recombinations.tre.branch_snps.expected.tab +++ b/tests/data/no_recombinations.tre.branch_snps.expected.tab @@ -1,15 +1,15 @@ -FT variation 23 -FT /node="N2->sequence_6" -FT /colour="4" -FT /taxa="sequence_6" -FT /parent_base="G" -FT /replace="A" FT variation 40 FT /node="N6->sequence_4" FT /colour="4" FT /taxa="sequence_4" FT /parent_base="A" FT /replace="G" +FT variation 23 +FT /node="N2->sequence_6" +FT /colour="4" +FT /taxa="sequence_6" +FT /parent_base="G" +FT /replace="A" FT variation 34 FT /node="N5->sequence_2" FT /colour="4" diff --git a/tests/data/recombination_at_root/expected_RAxML_result.recombination_at_root.iteration_1.tab b/tests/data/recombination_at_root/expected_RAxML_result.recombination_at_root.iteration_1.tab index d917ab70..16350fa7 100644 --- a/tests/data/recombination_at_root/expected_RAxML_result.recombination_at_root.iteration_1.tab +++ b/tests/data/recombination_at_root/expected_RAxML_result.recombination_at_root.iteration_1.tab @@ -1,18 +1,18 @@ FT misc_feature 339..978 FT /node="Node_1->sequence_two" -FT /neg_log_likelihood="77.515346" +FT /neg_log_likelihood="77.539411" FT /colour="4" FT /taxa="sequence_two" FT /SNP_count="32" FT misc_feature 11421..12388 FT /node="Node_3->Node_2" -FT /neg_log_likelihood="266.061457" +FT /neg_log_likelihood="266.166621" FT /colour="2" FT /taxa=" sequence_four sequence_one" FT /SNP_count="209" FT misc_feature 25..429 FT /node="Node_3->Node_2" -FT /neg_log_likelihood="35.854004" +FT /neg_log_likelihood="35.864561" FT /colour="2" FT /taxa=" sequence_four sequence_one" FT /SNP_count="10"