From 5f01b031aa04cd5f18ece95cbb3af8526febc542 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 4 May 2023 17:08:00 +0100 Subject: [PATCH 01/13] Change alignment checker output file names --- python/gubbins/tests/test_python_scripts.py | 2 +- python/scripts/gubbins_alignment_checker.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py index ecddc9f6..f129623d 100644 --- a/python/gubbins/tests/test_python_scripts.py +++ b/python/gubbins/tests/test_python_scripts.py @@ -23,7 +23,7 @@ class TestPythonScripts(unittest.TestCase): def test_alignment_checker(self): small_aln = os.path.join(data_dir, "valid_alignment.aln") - output_file = os.path.join(working_dir, "valid_alignment_test") + output_file = os.path.join(working_dir, "valid_alignment_test.csv") output_csv = os.path.join(working_dir, "valid_alignment_test.csv") test_csv = os.path.join(data_dir, "test_valid_output.csv") aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_file diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py index d0d65807..28851614 100755 --- a/python/scripts/gubbins_alignment_checker.py +++ b/python/scripts/gubbins_alignment_checker.py @@ -31,6 +31,8 @@ def parse_input_args(): formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--aln', '-a', dest="aln", help='Multifasta alignment file', required=True) + parser.add_argument('--out-aln', '-a', dest="aln", + help='Reformatted alignment file', required=False) parser.add_argument('--out', '-o', dest="out", help="Out csv name for writing results too", required=True) @@ -49,8 +51,6 @@ def main(input_args): tot_lines += 1 # Going to use counter in a first pass to store sequence counts and the range of sequences - - total_base_counts = [] iso_data = [] total_headers = [] @@ -88,7 +88,7 @@ def main(input_args): total_headers.insert(0, "isolate") write_out_headers = [re.sub("-","gap",i) for i in total_headers] print("Writing out results...") - with open((input_args.out + ".csv"), "w") as output: + with open((input_args.out), "w") as output: output.write(",".join(write_out_headers) + "\n") for i, aln_row in enumerate(isolate_bases): aln_row_str = list(map(str, aln_row)) From 425fc4383632eb1c65c7c90c1ef5645ca0275b37 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 4 May 2023 17:16:13 +0100 Subject: [PATCH 02/13] Update command line parsing --- python/scripts/gubbins_alignment_checker.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py index 28851614..0e1b135c 100755 --- a/python/scripts/gubbins_alignment_checker.py +++ b/python/scripts/gubbins_alignment_checker.py @@ -27,15 +27,22 @@ def parse_input_args(): parser = argparse.ArgumentParser( - description='Script to output bases in an alignment by isolate', + description='Script to evaluate and reformat an alignment prior to Gubbins analysis', formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--aln', '-a', dest="aln", - help='Multifasta alignment file', required=True) - parser.add_argument('--out-aln', '-a', dest="aln", - help='Reformatted alignment file', required=False) - parser.add_argument('--out', '-o', + parser.add_argument('--aln', + '-a', + dest="aln", + help='Multifasta alignment filename', + required=True) + parser.add_argument('--out-aln', + dest="out_aln", + help='Reformatted alignment filename', + required=False) + parser.add_argument('--out', + '-o', dest="out", - help="Out csv name for writing results too", required=True) + help='Output CSV filename', + required=True) return parser.parse_args() From 52ad7dd7a339770628b4201112d8bb1db87ee867 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 4 May 2023 17:47:03 +0100 Subject: [PATCH 03/13] Enable reformatting of alignments --- python/scripts/gubbins_alignment_checker.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py index 0e1b135c..76910d42 100755 --- a/python/scripts/gubbins_alignment_checker.py +++ b/python/scripts/gubbins_alignment_checker.py @@ -37,6 +37,7 @@ def parse_input_args(): parser.add_argument('--out-aln', dest="out_aln", help='Reformatted alignment filename', + default = None, required=False) parser.add_argument('--out', '-o', @@ -48,7 +49,7 @@ def parse_input_args(): def main(input_args): - ## Lets set up the csv + # Read the number of rows in the alignment row_num = 0 tot_lines = 0 with open(input_args.aln, "r") as aln_file: @@ -57,6 +58,21 @@ def main(input_args): row_num += 1 tot_lines += 1 + # Filter alignment if requested + aln_filename = input_args.aln + if input_args.out_aln: + with open(input_args.aln, "r") as aln_file, \ + open(input_args.out_aln, "w") as out_aln_file: + for line in aln_file: + if re.search("^>", line): + name = line.replace("#","_").replace(":","_").replace(">","").rstrip().split() + out_aln_file.write('>' + name[0] + '\n') + else: + sequence = line.upper().rstrip() + sequence = re.sub('[^ACGTN-]','N',sequence) + out_aln_file.write(sequence + '\n') + aln_filename = input_args.out_aln + # Going to use counter in a first pass to store sequence counts and the range of sequences total_base_counts = [] iso_data = [] @@ -64,7 +80,7 @@ def main(input_args): tot_length_str = str(tot_lines) print("Running through alignment file: %s" % input_args.aln) print() - with open(input_args.aln, "r") as aln_file: + with open(aln_filename, "r") as aln_file: for index,line in enumerate(aln_file): num_zeros = len(tot_length_str) - len(str(index + 1)) fmt_index = (("0" * num_zeros) + str(index + 1)) From 6999fbf768d7f81aca68b91c258ccd46f99d565f Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 4 May 2023 18:02:16 +0100 Subject: [PATCH 04/13] Add test for alignment reformatting --- .../gubbins/tests/data/corrected_alignment.aln | 8 ++++++++ .../gubbins/tests/data/corrected_alignment.csv | 5 +++++ python/gubbins/tests/data/invalid_alignment.aln | 8 ++++++++ python/gubbins/tests/test_python_scripts.py | 16 ++++++++++++++-- 4 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 python/gubbins/tests/data/corrected_alignment.aln create mode 100644 python/gubbins/tests/data/corrected_alignment.csv create mode 100644 python/gubbins/tests/data/invalid_alignment.aln diff --git a/python/gubbins/tests/data/corrected_alignment.aln b/python/gubbins/tests/data/corrected_alignment.aln new file mode 100644 index 00000000..fafdb3be --- /dev/null +++ b/python/gubbins/tests/data/corrected_alignment.aln @@ -0,0 +1,8 @@ +>sequence1 +AAANNNNAAA +>sequence2 +CCCCCCCCCC +>sequence3 +GGGGGGGGGG +>sequence4 +TTTTTTTTTT diff --git a/python/gubbins/tests/data/corrected_alignment.csv b/python/gubbins/tests/data/corrected_alignment.csv new file mode 100644 index 00000000..91e775f1 --- /dev/null +++ b/python/gubbins/tests/data/corrected_alignment.csv @@ -0,0 +1,5 @@ +isolate,A,N,C,G,T +sequence1,6,4,0,0,0 +sequence2,0,0,10,0,0 +sequence3,0,0,0,10,0 +sequence4,0,0,0,0,10 diff --git a/python/gubbins/tests/data/invalid_alignment.aln b/python/gubbins/tests/data/invalid_alignment.aln new file mode 100644 index 00000000..022422cf --- /dev/null +++ b/python/gubbins/tests/data/invalid_alignment.aln @@ -0,0 +1,8 @@ +>sequence1 +AAARRRRAAA +>sequence2 +CCCCCCCCCC +>sequence3 +GGGGGGGGGG +>sequence4 +TTTTTTTTTT diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py index f129623d..889b6d44 100644 --- a/python/gubbins/tests/test_python_scripts.py +++ b/python/gubbins/tests/test_python_scripts.py @@ -19,8 +19,7 @@ class TestPythonScripts(unittest.TestCase): - ## Test the alignment_checker script - + ## Test the alignment_checker script def test_alignment_checker(self): small_aln = os.path.join(data_dir, "valid_alignment.aln") output_file = os.path.join(working_dir, "valid_alignment_test.csv") @@ -31,6 +30,19 @@ def test_alignment_checker(self): assert self.md5_check(output_csv, test_csv) os.remove(output_csv) + def test_alignment_reformatting(self): + invalid_aln = os.path.join(data_dir, "invalid_alignment.aln") + output_file = os.path.join(working_dir, "invalid_alignment_test.aln") + output_csv = os.path.join(working_dir, "valid_alignment_test.csv") + test_aln = os.path.join(data_dir, "corrected_alignment.aln") + test_csv = os.path.join(data_dir, "corrected_alignment.csv") + aln_cmd = "gubbins_alignment_checker.py --aln " + invalid_aln + " --out-aln " + output_file + " --out " + output_csv + subprocess.check_call(aln_cmd, shell=True) + assert self.md5_check(output_csv, test_csv) + assert self.md5_check(output_file, test_aln) + os.remove(output_csv) + os.remove(output_file) + ## Test the clade extraction script def test_clade_extraction(self): multiple_aln = os.path.join(data_dir, "multiple_recombinations.aln") From 31c80fe8b4c30ff221300a14d804e34ed2625c4a Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Thu, 4 May 2023 21:45:37 +0100 Subject: [PATCH 05/13] Upgrade to ska2 --- python/scripts/generate_ska_alignment.py | 75 ++++-------------------- 1 file changed, 13 insertions(+), 62 deletions(-) diff --git a/python/scripts/generate_ska_alignment.py b/python/scripts/generate_ska_alignment.py index 73a6ab36..97182283 100755 --- a/python/scripts/generate_ska_alignment.py +++ b/python/scripts/generate_ska_alignment.py @@ -31,7 +31,7 @@ # command line parsing def get_options(): - parser = argparse.ArgumentParser(description='Generate a ska alignment from a list ' + parser = argparse.ArgumentParser(description='Generate a ska2 alignment from a list ' 'of assemblies', prog='generate_ska_alignment') @@ -39,12 +39,8 @@ def get_options(): parser.add_argument('--reference', help = 'Name of reference sequence to use for alignment', required = True) - parser.add_argument('--fasta', - help = 'Two column list of names and FASTA files to include in alignment', - default = None, - required = False) - parser.add_argument('--fastq', - help = 'Two/three column list of names and of FASTQ files to include in alignment', + parser.add_argument('--input', + help = 'List of sequence data; one row per isolate, with first column being the isolate's name', default = None, required = False) parser.add_argument('--out', @@ -53,7 +49,7 @@ def get_options(): parser.add_argument('--k', help = 'Split kmer size', type = int, - default = 15) + default = 17) parser.add_argument('--threads', help = 'Number of threads to use', type = int, @@ -89,62 +85,17 @@ def ska_map_sequences(seq, k = None, ref = None): # Check if ska is installed if which('ska') is None: - sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"') + sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2") sys.exit(1) - # Dictionary for sequence names - seq_names = {} - all_names = [] - - # Make split kmers from assemblies - fasta_names = [] - if args.fasta is not None: - # Read in FASTA assemblies - with open(args.fasta,'r') as fasta_list: - for line in fasta_list.readlines(): - info = line.strip().split() - if os.path.isfile(info[1]): - fasta_names.append(info[1]) - seq_names[info[1]] = info[0] - all_names.append(info[0]) - else: - sys.stderr.write('Unable to find file ' + info[1] + '\n') - # Sketch into split kmers - with Pool(processes = args.threads) as pool: - pool.map(partial(map_fasta_sequence, - k = args.k, - names = seq_names), - fasta_names) - - # Make split kmers from FASTQs - fastq_names = [] - if args.fastq is not None: - # Read in FASTQ reads - with open(args.fastq,'r') as fastq_list: - for line in fastq_list.readlines(): - info = line.strip().split() - if os.path.isfile(info[1]) and os.path.isfile(info[2]): - fastq_names.append((info[1],info[2])) - seq_names[info[1]] = info[0] - all_names.append(info[0]) - else: - sys.stderr.write('Unable to find files ' + info[1] + ' and ' + info[2] + '\n') - # Sketch into split kmers - with Pool(processes = args.threads) as pool: - return_codes = pool.map(partial(map_fastq_sequence, - k = args.k, - names = seq_names), - fastq_names) - - # Map sequences - with Pool(processes = args.threads) as pool: - return_codes = pool.map(partial(ska_map_sequences, - k = args.k, - ref = args.reference), - all_names) - - # Generate alignment - subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out, + # Build ska sketch + subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(k) + \ + ' -f ' + args.input + ' --threads ' + str(args.threads), + shell = True) + + # Run ska mapping + subprocess.check_output('ska map -o tmp.' + args.out + ' -r ' + ref + \ + ' --threads ' + str(args.threads) + ' ' + args.out + '.skf', shell = True) # Clean up From 1069d03bd52bbacd6f538340d2769eefdccfdce7 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 5 May 2023 05:23:47 +0100 Subject: [PATCH 06/13] Update ska alignment testing script --- python/gubbins/tests/test_python_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py index 889b6d44..8baead2d 100644 --- a/python/gubbins/tests/test_python_scripts.py +++ b/python/gubbins/tests/test_python_scripts.py @@ -94,7 +94,7 @@ def test_generate_ska_alignment(self): ref_seq = os.path.join(preprocess_dir, 'sequence_t1.fasta') aln_out = os.path.join(preprocess_dir, 'ska_test_aln.aln') # Script name - ska_cmd = "generate_ska_alignment.py --fasta " + fasta_loc +\ + ska_cmd = "generate_ska_alignment.py --input " + fasta_loc +\ " --reference " + ref_seq + " --out " + aln_out +\ " --k 6" subprocess.check_call(ska_cmd, shell=True) From 12747bdd00e9c697136a4418eff555abba745cf1 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 5 May 2023 06:11:55 +0100 Subject: [PATCH 07/13] Update alignment generation and formatting --- python/scripts/generate_ska_alignment.py | 31 +++++++++++++++------ python/scripts/gubbins_alignment_checker.py | 1 - 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/python/scripts/generate_ska_alignment.py b/python/scripts/generate_ska_alignment.py index 97182283..7c5ba35e 100755 --- a/python/scripts/generate_ska_alignment.py +++ b/python/scripts/generate_ska_alignment.py @@ -40,7 +40,7 @@ def get_options(): help = 'Name of reference sequence to use for alignment', required = True) parser.add_argument('--input', - help = 'List of sequence data; one row per isolate, with first column being the isolate's name', + help = 'List of sequence data; one row per isolate, with first column being the isolate name', default = None, required = False) parser.add_argument('--out', @@ -85,22 +85,37 @@ def ska_map_sequences(seq, k = None, ref = None): # Check if ska is installed if which('ska') is None: - sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2") + sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2"') + sys.exit(1) + + # Check if k value is acceptable: + if (args.k % 2) == 0 or args.k < 5 or args.k > 63: + sys.stderr.write('k must be odd and between 5 and 63\n') sys.exit(1) # Build ska sketch - subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(k) + \ + subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(args.k) + \ ' -f ' + args.input + ' --threads ' + str(args.threads), shell = True) # Run ska mapping - subprocess.check_output('ska map -o tmp.' + args.out + ' -r ' + ref + \ - ' --threads ' + str(args.threads) + ' ' + args.out + '.skf', + if os.path.exists(args.out + '.skf'): + tmp_aln = os.path.join(os.path.dirname(args.out), 'tmp.' + os.path.basename(args.out)) + subprocess.check_output('ska map -o ' + tmp_aln + ' --threads ' + str(args.threads) + ' ' + \ + args.reference + ' ' + args.out + '.skf', + shell = True) + else: + sys.stderr.write('ska building failed\n') + sys.exit(1) + + # Clean alignment to prep for Gubbins + subprocess.check_output('gubbins_alignment_checker.py --aln ' + tmp_aln + ' --out-aln ' + args.out + \ + ' --out ' + args.out + '.csv', shell = True) + + sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\n") # Clean up if not args.no_cleanup: - subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]), - shell = True) - subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]), + subprocess.check_output('rm ' + args.out + '.skf ' + tmp_aln, shell = True) diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py index 76910d42..226f3907 100755 --- a/python/scripts/gubbins_alignment_checker.py +++ b/python/scripts/gubbins_alignment_checker.py @@ -20,7 +20,6 @@ # import argparse -from asyncore import write import re from collections import Counter From a489caf9e1a281c9dc9ae088f9240e18db51b536 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 5 May 2023 06:32:59 +0100 Subject: [PATCH 08/13] Change k-mer length --- python/gubbins/tests/test_python_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py index 8baead2d..3c14bbc4 100644 --- a/python/gubbins/tests/test_python_scripts.py +++ b/python/gubbins/tests/test_python_scripts.py @@ -96,7 +96,7 @@ def test_generate_ska_alignment(self): # Script name ska_cmd = "generate_ska_alignment.py --input " + fasta_loc +\ " --reference " + ref_seq + " --out " + aln_out +\ - " --k 6" + " --k 7" subprocess.check_call(ska_cmd, shell=True) ## Now run gubbins on the aln and check all the output is produced parser = run_gubbins.parse_input_args() From d066650c080caff10ceb3aaf9eba3eab34246418 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 5 May 2023 06:37:05 +0100 Subject: [PATCH 09/13] Update versions for alignment generation --- VERSION | 2 +- environment.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index eb39e538..15a27998 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.3 +3.3.0 diff --git a/environment.yml b/environment.yml index 758f8371..370369a1 100644 --- a/environment.yml +++ b/environment.yml @@ -34,4 +34,4 @@ dependencies: - raxml-ng=1.0.1 - fasttree=2.1.10 # Scripts - - ska + - ska2 From c7adbb70ddc8302a5b91281a91e606e896a09eb0 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 5 May 2023 07:07:10 +0100 Subject: [PATCH 10/13] Rearrange alignment summary output file --- python/gubbins/tests/data/corrected_alignment.csv | 8 ++++---- python/gubbins/tests/test_python_scripts.py | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/python/gubbins/tests/data/corrected_alignment.csv b/python/gubbins/tests/data/corrected_alignment.csv index 91e775f1..c8c5eb42 100644 --- a/python/gubbins/tests/data/corrected_alignment.csv +++ b/python/gubbins/tests/data/corrected_alignment.csv @@ -1,5 +1,5 @@ -isolate,A,N,C,G,T -sequence1,6,4,0,0,0 -sequence2,0,0,10,0,0 -sequence3,0,0,0,10,0 +isolate,A,C,G,N,T +sequence1,6,0,0,4,0 +sequence2,0,10,0,0,0 +sequence3,0,0,10,0,0 sequence4,0,0,0,0,10 diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py index 3c14bbc4..62e3fe0e 100644 --- a/python/gubbins/tests/test_python_scripts.py +++ b/python/gubbins/tests/test_python_scripts.py @@ -22,12 +22,11 @@ class TestPythonScripts(unittest.TestCase): ## Test the alignment_checker script def test_alignment_checker(self): small_aln = os.path.join(data_dir, "valid_alignment.aln") - output_file = os.path.join(working_dir, "valid_alignment_test.csv") output_csv = os.path.join(working_dir, "valid_alignment_test.csv") test_csv = os.path.join(data_dir, "test_valid_output.csv") - aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_file + aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_csv subprocess.check_call(aln_cmd, shell=True) - assert self.md5_check(output_csv, test_csv) + assert self.md5_check(output_file, test_csv) os.remove(output_csv) def test_alignment_reformatting(self): From 5434e031f58e89a1d8ce4605cbcb137d74cbe64d Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 5 May 2023 07:16:29 +0100 Subject: [PATCH 11/13] Fix filename variable --- python/gubbins/tests/test_python_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py index 62e3fe0e..7a1724e1 100644 --- a/python/gubbins/tests/test_python_scripts.py +++ b/python/gubbins/tests/test_python_scripts.py @@ -26,7 +26,7 @@ def test_alignment_checker(self): test_csv = os.path.join(data_dir, "test_valid_output.csv") aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_csv subprocess.check_call(aln_cmd, shell=True) - assert self.md5_check(output_file, test_csv) + assert self.md5_check(output_csv, test_csv) os.remove(output_csv) def test_alignment_reformatting(self): From e02866ed85da12435101155e1d794e3751331184 Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 5 May 2023 07:41:47 +0100 Subject: [PATCH 12/13] Sort CSV headers --- python/scripts/gubbins_alignment_checker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py index 226f3907..1257a6d8 100755 --- a/python/scripts/gubbins_alignment_checker.py +++ b/python/scripts/gubbins_alignment_checker.py @@ -100,6 +100,7 @@ def main(input_args): ## Second pass to line up all the counts across the isolates print("") print("Assessing counts...") + total_headers.sort() isolate_bases = [] for count in total_base_counts: iso_row = [] From c2d0ac0da9b4eee86ab099423c148f1541152a1c Mon Sep 17 00:00:00 2001 From: Nick Croucher Date: Fri, 5 May 2023 07:56:28 +0100 Subject: [PATCH 13/13] Update for new alignment generation --- docs/gubbins_manual.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/gubbins_manual.md b/docs/gubbins_manual.md index 31ecd47e..e3fd739a 100644 --- a/docs/gubbins_manual.md +++ b/docs/gubbins_manual.md @@ -52,13 +52,13 @@ These will automatically be installed within the conda environment. Please cite The required input file for Gubbins is a whole genome FASTA alignment. Each sequence should have a unique identifier, and special characters should be avoided. The sequences should only use the characters `ACGT` (DNA bases), `N` (unknown base) or `-` (alignment gap). If a starting tree is to be included, then this should be a Newick format. -The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA](https://github.com/simonrharris/SKA), which can be installed through `conda install -c bioconda ska` (SKA is included when installing Gubbins through conda). For instance, +The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA2](https://github.com/bacpop/ska.rust), which can be installed through `conda install -c bioconda ska2` (SKA2 is included when installing Gubbins through conda). For instance, ``` -generate_ska_alignment.py --reference seq_X.fa --fasta fasta_files.list --fastq fastq_files.list --out out.aln +generate_ska_alignment.py --reference seq_X.fa --input input.list --out out.aln ``` -Where `fasta_files.list` is a two column tab-delimited file containing sequence names (in the first column) and FASTA sequence assembly file paths (in the second column); `fastq_files.list` contains the same for unassembled FASTQ-format read data. +Where `input.list` is a tab-delimited file with one row per isolate. The first column should be the isolate name, and the subsequent entries on the same row should contain the corresponding sequence data - this may be a single FASTA assembly, or multiple FASTQ raw read files. The alignment will be reformatted for Gubbins (e.g. modifying isolate names and removing non-N IUPAC ambiguity codes) by this script. The alignment can then be analysed with Gubbins: