diff --git a/VERSION b/VERSION index eb39e538..15a27998 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.3 +3.3.0 diff --git a/docs/gubbins_manual.md b/docs/gubbins_manual.md index 31ecd47e..e3fd739a 100644 --- a/docs/gubbins_manual.md +++ b/docs/gubbins_manual.md @@ -52,13 +52,13 @@ These will automatically be installed within the conda environment. Please cite The required input file for Gubbins is a whole genome FASTA alignment. Each sequence should have a unique identifier, and special characters should be avoided. The sequences should only use the characters `ACGT` (DNA bases), `N` (unknown base) or `-` (alignment gap). If a starting tree is to be included, then this should be in Newick format. -The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA](https://github.com/simonrharris/SKA), which can be installed through `conda install -c bioconda ska` (SKA is included when installing Gubbins through conda). For instance, +The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA2](https://github.com/bacpop/ska.rust). SKA2 can be installed through `conda install -c bioconda ska2`, and is included when installing Gubbins through conda. For instance, ``` -generate_ska_alignment.py --reference seq_X.fa --fasta fasta_files.list --fastq fastq_files.list --out out.aln +generate_ska_alignment.py --reference seq_X.fa --input input.list --out out.aln ``` -Where `fasta_files.list` is a two column tab-delimited file containing sequence names (in the first column) and FASTA sequence assembly file paths (in the second column); `fastq_files.list` contains the same for unassembled FASTQ-format read data. +Where `input.list` is a tab-delimited file with one row per isolate: the first column should be the isolate name, and the subsequent entries on the same row should contain the corresponding sequence data, either a single FASTA assembly or multiple FASTQ raw read files. This script will also reformat the alignment for Gubbins (e.g. modifying isolate names and converting IUPAC ambiguity codes other than N to Ns).
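For illustration, a hypothetical `input.list` describing one assembled genome and one set of paired-end reads might look like the following (columns separated by tabs; the isolate and file names here are placeholders):
```
isolate_1	isolate_1_assembly.fa
isolate_2	isolate_2_reads_1.fastq.gz	isolate_2_reads_2.fastq.gz
```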
The alignment can then be analysed with Gubbins: diff --git a/environment.yml b/environment.yml index 758f8371..370369a1 100644 --- a/environment.yml +++ b/environment.yml @@ -34,4 +34,4 @@ dependencies: - raxml-ng=1.0.1 - fasttree=2.1.10 # Scripts - - ska + - ska2 diff --git a/python/gubbins/tests/data/corrected_alignment.aln b/python/gubbins/tests/data/corrected_alignment.aln new file mode 100644 index 00000000..fafdb3be --- /dev/null +++ b/python/gubbins/tests/data/corrected_alignment.aln @@ -0,0 +1,8 @@ +>sequence1 +AAANNNNAAA +>sequence2 +CCCCCCCCCC +>sequence3 +GGGGGGGGGG +>sequence4 +TTTTTTTTTT diff --git a/python/gubbins/tests/data/corrected_alignment.csv b/python/gubbins/tests/data/corrected_alignment.csv new file mode 100644 index 00000000..c8c5eb42 --- /dev/null +++ b/python/gubbins/tests/data/corrected_alignment.csv @@ -0,0 +1,5 @@ +isolate,A,C,G,N,T +sequence1,6,0,0,4,0 +sequence2,0,10,0,0,0 +sequence3,0,0,10,0,0 +sequence4,0,0,0,0,10 diff --git a/python/gubbins/tests/data/invalid_alignment.aln b/python/gubbins/tests/data/invalid_alignment.aln new file mode 100644 index 00000000..022422cf --- /dev/null +++ b/python/gubbins/tests/data/invalid_alignment.aln @@ -0,0 +1,8 @@ +>sequence1 +AAARRRRAAA +>sequence2 +CCCCCCCCCC +>sequence3 +GGGGGGGGGG +>sequence4 +TTTTTTTTTT diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py index ecddc9f6..7a1724e1 100644 --- a/python/gubbins/tests/test_python_scripts.py +++ b/python/gubbins/tests/test_python_scripts.py @@ -19,17 +19,28 @@ class TestPythonScripts(unittest.TestCase): - ## Test the alignment_checker script - + ## Test the alignment_checker script def test_alignment_checker(self): small_aln = os.path.join(data_dir, "valid_alignment.aln") - output_file = os.path.join(working_dir, "valid_alignment_test") output_csv = os.path.join(working_dir, "valid_alignment_test.csv") test_csv = os.path.join(data_dir, "test_valid_output.csv") - aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_file + aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_csv + subprocess.check_call(aln_cmd, shell=True) + assert self.md5_check(output_csv, test_csv) + os.remove(output_csv) + + def test_alignment_reformatting(self): + invalid_aln = os.path.join(data_dir, "invalid_alignment.aln") + output_file = os.path.join(working_dir, "invalid_alignment_test.aln") + output_csv = os.path.join(working_dir, "valid_alignment_test.csv") + test_aln = os.path.join(data_dir, "corrected_alignment.aln") + test_csv = os.path.join(data_dir, "corrected_alignment.csv") + aln_cmd = "gubbins_alignment_checker.py --aln " + invalid_aln + " --out-aln " + output_file + " --out " + output_csv subprocess.check_call(aln_cmd, shell=True) assert self.md5_check(output_csv, test_csv) + assert self.md5_check(output_file, test_aln) os.remove(output_csv) + os.remove(output_file) ## Test the clade extraction script def test_clade_extraction(self): @@ -82,9 +93,9 @@ def test_generate_ska_alignment(self): ref_seq = os.path.join(preprocess_dir, 'sequence_t1.fasta') aln_out = os.path.join(preprocess_dir, 'ska_test_aln.aln') # Script name - ska_cmd = "generate_ska_alignment.py --fasta " + fasta_loc +\ + ska_cmd = "generate_ska_alignment.py --input " + fasta_loc +\ " --reference " + ref_seq + " --out " + aln_out +\ - " --k 6" + " --k 7" subprocess.check_call(ska_cmd, shell=True) ## Now run gubbins on the aln and check all the output is produced parser = run_gubbins.parse_input_args() diff --git 
a/python/scripts/generate_ska_alignment.py b/python/scripts/generate_ska_alignment.py index 73a6ab36..7c5ba35e 100755 --- a/python/scripts/generate_ska_alignment.py +++ b/python/scripts/generate_ska_alignment.py @@ -31,7 +31,7 @@ # command line parsing def get_options(): - parser = argparse.ArgumentParser(description='Generate a ska alignment from a list ' + parser = argparse.ArgumentParser(description='Generate a ska2 alignment from a list ' 'of assemblies', prog='generate_ska_alignment') @@ -39,12 +39,8 @@ def get_options(): parser.add_argument('--reference', help = 'Name of reference sequence to use for alignment', required = True) - parser.add_argument('--fasta', - help = 'Two column list of names and FASTA files to include in alignment', - default = None, - required = False) - parser.add_argument('--fastq', - help = 'Two/three column list of names and of FASTQ files to include in alignment', + parser.add_argument('--input', + help = 'List of sequence data; one row per isolate, with first column being the isolate name', default = None, required = False) parser.add_argument('--out', @@ -53,7 +49,7 @@ def get_options(): parser.add_argument('--k', help = 'Split kmer size', type = int, - default = 15) + default = 17) parser.add_argument('--threads', help = 'Number of threads to use', type = int, @@ -89,67 +85,37 @@ def ska_map_sequences(seq, k = None, ref = None): # Check if ska is installed if which('ska') is None: - sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"') + sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2"') sys.exit(1) - # Dictionary for sequence names - seq_names = {} - all_names = [] - - # Make split kmers from assemblies - fasta_names = [] - if args.fasta is not None: - # Read in FASTA assemblies - with open(args.fasta,'r') as fasta_list: - for line in fasta_list.readlines(): - info = line.strip().split() - if os.path.isfile(info[1]): - fasta_names.append(info[1]) - seq_names[info[1]] = info[0] - all_names.append(info[0]) - else: - sys.stderr.write('Unable to find file ' + info[1] + '\n') - # Sketch into split kmers - with Pool(processes = args.threads) as pool: - pool.map(partial(map_fasta_sequence, - k = args.k, - names = seq_names), - fasta_names) - - # Make split kmers from FASTQs - fastq_names = [] - if args.fastq is not None: - # Read in FASTQ reads - with open(args.fastq,'r') as fastq_list: - for line in fastq_list.readlines(): - info = line.strip().split() - if os.path.isfile(info[1]) and os.path.isfile(info[2]): - fastq_names.append((info[1],info[2])) - seq_names[info[1]] = info[0] - all_names.append(info[0]) - else: - sys.stderr.write('Unable to find files ' + info[1] + ' and ' + info[2] + '\n') - # Sketch into split kmers - with Pool(processes = args.threads) as pool: - return_codes = pool.map(partial(map_fastq_sequence, - k = args.k, - names = seq_names), - fastq_names) - - # Map sequences - with Pool(processes = args.threads) as pool: - return_codes = pool.map(partial(ska_map_sequences, - k = args.k, - ref = args.reference), - all_names) - - # Generate alignment - subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out, + # Check if k value is acceptable: + if (args.k % 2) == 0 or args.k < 5 or args.k > 63: + sys.stderr.write('k must be odd and between 5 and 63\n') + sys.exit(1) + + # Build ska sketch + subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(args.k) + \ + ' -f ' + args.input + ' --threads ' + str(args.threads), shell = True) + 
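+ # The --input file list is passed straight to ska's -f option: one isolate per line, with the isolate name in the first column followed by its FASTA/FASTQ path(s), tab-separated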
# Run ska mapping + if os.path.exists(args.out + '.skf'): + tmp_aln = os.path.join(os.path.dirname(args.out), 'tmp.' + os.path.basename(args.out)) + subprocess.check_output('ska map -o ' + tmp_aln + ' --threads ' + str(args.threads) + ' ' + \ + args.reference + ' ' + args.out + '.skf', + shell = True) + else: + sys.stderr.write('ska building failed\n') + sys.exit(1) + + # Clean alignment to prep for Gubbins + subprocess.check_output('gubbins_alignment_checker.py --aln ' + tmp_aln + ' --out-aln ' + args.out + \ + ' --out ' + args.out + '.csv', + shell = True) + + sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\n") + # Clean up if not args.no_cleanup: - subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]), - shell = True) - subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]), + subprocess.check_output('rm ' + args.out + '.skf ' + tmp_aln, shell = True) diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py index d0d65807..1257a6d8 100755 --- a/python/scripts/gubbins_alignment_checker.py +++ b/python/scripts/gubbins_alignment_checker.py @@ -20,26 +20,35 @@ # import argparse -from asyncore import write import re from collections import Counter def parse_input_args(): parser = argparse.ArgumentParser( - description='Script to output bases in an alignment by isolate', + description='Script to evaluate and reformat an alignment prior to Gubbins analysis', formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--aln', '-a', dest="aln", - help='Multifasta alignment file', required=True) - parser.add_argument('--out', '-o', + parser.add_argument('--aln', + '-a', + dest="aln", + help='Multifasta alignment filename', + required=True) + parser.add_argument('--out-aln', + dest="out_aln", + help='Reformatted alignment filename', + default = None, + required=False) + parser.add_argument('--out', + '-o', dest="out", - help="Out csv name for writing results too", required=True) + help='Output CSV filename', + required=True) return parser.parse_args() def main(input_args): - ## Lets set up the csv + # Read the number of rows in the alignment row_num = 0 tot_lines = 0 with open(input_args.aln, "r") as aln_file: @@ -48,16 +57,29 @@ def main(input_args): row_num += 1 tot_lines += 1 - # Going to use counter in a first pass to store sequence counts and the range of sequences - + # Filter alignment if requested + aln_filename = input_args.aln + if input_args.out_aln: + with open(input_args.aln, "r") as aln_file, \ + open(input_args.out_aln, "w") as out_aln_file: + for line in aln_file: + if re.search("^>", line): + name = line.replace("#","_").replace(":","_").replace(">","").rstrip().split() + out_aln_file.write('>' + name[0] + '\n') + else: + sequence = line.upper().rstrip() + sequence = re.sub('[^ACGTN-]','N',sequence) + out_aln_file.write(sequence + '\n') + aln_filename = input_args.out_aln + # Going to use counter in a first pass to store sequence counts and the range of sequences total_base_counts = [] iso_data = [] total_headers = [] tot_length_str = str(tot_lines) print("Running through alignment file: %s" % input_args.aln) print() - with open(input_args.aln, "r") as aln_file: + with open(aln_filename, "r") as aln_file: for index,line in enumerate(aln_file): num_zeros = len(tot_length_str) - len(str(index + 1)) fmt_index = (("0" * num_zeros) + str(index + 1)) @@ -78,6 +100,7 @@ def main(input_args): ## Second pass to line up all the 
counts across the isolates print("") print("Assessing counts...") + total_headers.sort() isolate_bases = [] for count in total_base_counts: iso_row = [] @@ -88,7 +111,7 @@ def main(input_args): total_headers.insert(0, "isolate") write_out_headers = [re.sub("-","gap",i) for i in total_headers] print("Writing out results...") - with open((input_args.out + ".csv"), "w") as output: + with open((input_args.out), "w") as output: output.write(",".join(write_out_headers) + "\n") for i, aln_row in enumerate(isolate_bases): aln_row_str = list(map(str, aln_row))
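As a worked example of the reformatting step (a minimal sketch that mirrors the new test case and the test files added in this diff), the alignment checker can also be run directly on an existing alignment to produce a Gubbins-ready copy alongside the per-isolate base counts:
```
gubbins_alignment_checker.py --aln invalid_alignment.aln --out-aln corrected_alignment.aln --out corrected_alignment.csv
```
On the `invalid_alignment.aln` test data above, this replaces the ambiguous `R` bases in `sequence1` with `N`, giving the `corrected_alignment.aln` and `corrected_alignment.csv` files also included in this change.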