Merge pull request #367 from nickjcroucher/alignment_updating

Update alignment generation methods
nickjcroucher · May 5, 2023 · 2c3c7c2 · 2c3c7c2
2 parents f7ef47e + c2d0ac0
commit 2c3c7c2
Show file tree

Hide file tree

Showing 9 changed files with 108 additions and 87 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.3
+3.3.0
diff --git a/docs/gubbins_manual.md b/docs/gubbins_manual.md
@@ -52,13 +52,13 @@ These will automatically be installed within the conda environment. Please cite
 
 The required input file for Gubbins is a whole genome FASTA alignment. Each sequence should have a unique identifier, and special characters should be avoided. The sequences should only use the characters `ACGT` (DNA bases), `N` (unknown base) or `-` (alignment gap). If a starting tree is to be included, then this should be a Newick format.
 
-The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA](https://github.com/simonrharris/SKA), which can be installed through `conda install -c bioconda ska` (SKA is included when installing Gubbins through conda). For instance,
+The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA2](https://github.com/bacpop/ska.rust), which can be installed through `conda install -c bioconda ska2` (SKA2 is included when installing Gubbins through conda). For instance,
 
 ```
-generate_ska_alignment.py --reference seq_X.fa --fasta fasta_files.list --fastq fastq_files.list --out out.aln
+generate_ska_alignment.py --reference seq_X.fa --input input.list --out out.aln
 ```
 
-Where `fasta_files.list` is a two column tab-delimited file containing sequence names (in the first column) and FASTA sequence assembly file paths (in the second column); `fastq_files.list` contains the same for unassembled FASTQ-format read data.
+Where `input.list` is a tab-delimited file with one row per isolate. The first column should be the isolate name, and the subsequent entries on the same row should contain the corresponding sequence data - this may be a single FASTA assembly, or multiple FASTQ raw read files. The alignment will be reformatted for Gubbins (e.g. modifying isolate names and removing non-N IUPAC ambiguity codes) by this script.
 
 The alignment can then be analysed with Gubbins:
 

diff --git a/environment.yml b/environment.yml
@@ -34,4 +34,4 @@ dependencies:
   - raxml-ng=1.0.1
   - fasttree=2.1.10
 # Scripts
-  - ska
+  - ska2
diff --git a/python/gubbins/tests/data/corrected_alignment.aln b/python/gubbins/tests/data/corrected_alignment.aln
@@ -0,0 +1,8 @@
+>sequence1
+AAANNNNAAA
+>sequence2
+CCCCCCCCCC
+>sequence3
+GGGGGGGGGG
+>sequence4
+TTTTTTTTTT
diff --git a/python/gubbins/tests/data/corrected_alignment.csv b/python/gubbins/tests/data/corrected_alignment.csv
@@ -0,0 +1,5 @@
+isolate,A,C,G,N,T
+sequence1,6,0,0,4,0
+sequence2,0,10,0,0,0
+sequence3,0,0,10,0,0
+sequence4,0,0,0,0,10
diff --git a/python/gubbins/tests/data/invalid_alignment.aln b/python/gubbins/tests/data/invalid_alignment.aln
@@ -0,0 +1,8 @@
+>sequence1
+AAARRRRAAA
+>sequence2
+CCCCCCCCCC
+>sequence3
+GGGGGGGGGG
+>sequence4
+TTTTTTTTTT
diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py
@@ -19,17 +19,28 @@
 
 class TestPythonScripts(unittest.TestCase):
 
-    ## Test the alignment_checker script 
-
+    ## Test the alignment_checker script
     def test_alignment_checker(self):
         small_aln = os.path.join(data_dir, "valid_alignment.aln")
-        output_file = os.path.join(working_dir, "valid_alignment_test")
         output_csv = os.path.join(working_dir, "valid_alignment_test.csv")
         test_csv = os.path.join(data_dir, "test_valid_output.csv")
-        aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_file
+        aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_csv
+        subprocess.check_call(aln_cmd, shell=True)
+        assert self.md5_check(output_csv, test_csv)
+        os.remove(output_csv)
+
+    def test_alignment_reformatting(self):
+        invalid_aln = os.path.join(data_dir, "invalid_alignment.aln")
+        output_file = os.path.join(working_dir, "invalid_alignment_test.aln")
+        output_csv = os.path.join(working_dir, "valid_alignment_test.csv")
+        test_aln = os.path.join(data_dir, "corrected_alignment.aln")
+        test_csv = os.path.join(data_dir, "corrected_alignment.csv")
+        aln_cmd = "gubbins_alignment_checker.py --aln " + invalid_aln + " --out-aln " + output_file + " --out " + output_csv
         subprocess.check_call(aln_cmd, shell=True)
         assert self.md5_check(output_csv, test_csv)
+        assert self.md5_check(output_file, test_aln)
         os.remove(output_csv)
+        os.remove(output_file)
 
     ## Test the clade extraction script
     def test_clade_extraction(self):
@@ -82,9 +93,9 @@ def test_generate_ska_alignment(self):
         ref_seq = os.path.join(preprocess_dir, 'sequence_t1.fasta')
         aln_out = os.path.join(preprocess_dir, 'ska_test_aln.aln')
         # Script name
-        ska_cmd = "generate_ska_alignment.py --fasta " + fasta_loc +\
+        ska_cmd = "generate_ska_alignment.py --input " + fasta_loc +\
             " --reference " + ref_seq + " --out " + aln_out +\
-                " --k 6"
+                " --k 7"
         subprocess.check_call(ska_cmd, shell=True)
         ## Now run gubbins on the aln and check all the output is produced 
         parser = run_gubbins.parse_input_args()

diff --git a/python/scripts/generate_ska_alignment.py b/python/scripts/generate_ska_alignment.py
@@ -31,20 +31,16 @@
 # command line parsing
 def get_options():
 
-    parser = argparse.ArgumentParser(description='Generate a ska alignment from a list '
+    parser = argparse.ArgumentParser(description='Generate a ska2 alignment from a list '
                                                     'of assemblies',
                                      prog='generate_ska_alignment')
 
     # input options
     parser.add_argument('--reference',
                         help = 'Name of reference sequence to use for alignment',
                         required = True)
-    parser.add_argument('--fasta',
-                        help = 'Two column list of names and FASTA files to include in alignment',
-                        default = None,
-                        required = False)
-    parser.add_argument('--fastq',
-                        help = 'Two/three column list of names and of FASTQ files to include in alignment',
+    parser.add_argument('--input',
+                        help = 'List of sequence data; one row per isolate, with first column being the isolate name',
                         default = None,
                         required = False)
     parser.add_argument('--out',
@@ -53,7 +49,7 @@ def get_options():
     parser.add_argument('--k',
                         help = 'Split kmer size',
                         type = int,
-                        default = 15)
+                        default = 17)
     parser.add_argument('--threads',
                         help = 'Number of threads to use',
                         type = int,
@@ -89,67 +85,37 @@ def ska_map_sequences(seq, k = None, ref = None):
 
     # Check if ska is installed
     if which('ska') is None:
-        sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"')
+        sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2"')
         sys.exit(1)
 
-    # Dictionary for sequence names
-    seq_names = {}
-    all_names = []
-
-    # Make split kmers from assemblies
-    fasta_names = []
-    if args.fasta is not None:
-        # Read in FASTA assemblies
-        with open(args.fasta,'r') as fasta_list:
-            for line in fasta_list.readlines():
-                info = line.strip().split()
-                if os.path.isfile(info[1]):
-                    fasta_names.append(info[1])
-                    seq_names[info[1]] = info[0]
-                    all_names.append(info[0])
-                else:
-                    sys.stderr.write('Unable to find file ' + info[1] + '\n')
-        # Sketch into split kmers
-        with Pool(processes = args.threads) as pool:
-            pool.map(partial(map_fasta_sequence,
-                                k = args.k,
-                                names = seq_names),
-                                fasta_names)
-
-    # Make split kmers from FASTQs
-    fastq_names = []
-    if args.fastq is not None:
-        # Read in FASTQ reads
-        with open(args.fastq,'r') as fastq_list:
-            for line in fastq_list.readlines():
-                info = line.strip().split()
-                if os.path.isfile(info[1]) and os.path.isfile(info[2]):
-                    fastq_names.append((info[1],info[2]))
-                    seq_names[info[1]] = info[0]
-                    all_names.append(info[0])
-                else:
-                    sys.stderr.write('Unable to find files ' + info[1] + ' and ' + info[2] + '\n')
-        # Sketch into split kmers
-        with Pool(processes = args.threads) as pool:
-            return_codes = pool.map(partial(map_fastq_sequence,
-                                            k = args.k,
-                                            names = seq_names),
-                                            fastq_names)
-
-    # Map sequences
-    with Pool(processes = args.threads) as pool:
-        return_codes = pool.map(partial(ska_map_sequences,
-                                        k = args.k,
-                                        ref = args.reference),
-                                        all_names)
-
-    # Generate alignment
-    subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out,
+    # Check if k value is acceptable:
+    if (args.k % 2) == 0 or args.k < 5 or args.k > 63:
+        sys.stderr.write('k must be odd and between 5 and 63\n')
+        sys.exit(1)
+
+    # Build ska sketch
+    subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(args.k) + \
+                            ' -f ' + args.input + ' --threads ' + str(args.threads),
                             shell = True)
 
+    # Run ska mapping
+    if os.path.exists(args.out + '.skf'):
+        tmp_aln = os.path.join(os.path.dirname(args.out), 'tmp.' + os.path.basename(args.out))
+        subprocess.check_output('ska map -o ' + tmp_aln + ' --threads ' + str(args.threads) + ' ' +  \
+                                    args.reference + ' ' + args.out + '.skf',
+                                shell = True)
+    else:
+        sys.stderr.write('ska building failed\n')
+        sys.exit(1)
+
+    # Clean alignment to prep for Gubbins
+    subprocess.check_output('gubbins_alignment_checker.py --aln ' + tmp_aln + ' --out-aln '  + args.out + \
+                             ' --out ' + args.out + '.csv',
+                            shell = True)
+
+    sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\n")
+
     # Clean up
     if not args.no_cleanup:
-        subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]),
-                                shell = True)
-        subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]),
+        subprocess.check_output('rm ' + args.out + '.skf ' + tmp_aln,
                                 shell = True)
diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py
@@ -20,26 +20,35 @@
 #
 
 import argparse
-from asyncore import write
 import re
 from collections import Counter
 
 def parse_input_args():
 
     parser = argparse.ArgumentParser(
-        description='Script to output bases in an alignment by isolate',
+        description='Script to evaluate and reformat an alignment prior to Gubbins analysis',
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--aln', '-a', dest="aln",
-                        help='Multifasta alignment file', required=True)
-    parser.add_argument('--out', '-o',
+    parser.add_argument('--aln',
+                        '-a',
+                        dest="aln",
+                        help='Multifasta alignment filename',
+                        required=True)
+    parser.add_argument('--out-aln',
+                        dest="out_aln",
+                        help='Reformatted alignment filename',
+                        default = None,
+                        required=False)
+    parser.add_argument('--out',
+                        '-o',
                         dest="out",
-                        help="Out csv name for writing results too", required=True)
+                        help='Output CSV filename',
+                        required=True)
 
     return parser.parse_args()
 
 def main(input_args):
 
-    ## Lets set up the csv
+    # Read the number of rows in the alignment
     row_num = 0
     tot_lines = 0
     with open(input_args.aln, "r") as aln_file:
@@ -48,16 +57,29 @@ def main(input_args):
                 row_num += 1
             tot_lines += 1
 
-    # Going to use counter in a first pass to store sequence counts and the range of sequences
-
+    # Filter alignment if requested
+    aln_filename = input_args.aln
+    if input_args.out_aln:
+        with open(input_args.aln, "r") as aln_file, \
+                open(input_args.out_aln, "w") as out_aln_file:
+            for line in aln_file:
+                if re.search("^>", line):
+                    name = line.replace("#","_").replace(":","_").replace(">","").rstrip().split()
+                    out_aln_file.write('>' + name[0] + '\n')
+                else:
+                    sequence = line.upper().rstrip()
+                    sequence = re.sub('[^ACGTN-]','N',sequence)
+                    out_aln_file.write(sequence + '\n')
+        aln_filename = input_args.out_aln
 
+    # Going to use counter in a first pass to store sequence counts and the range of sequences
     total_base_counts = []
     iso_data = []
     total_headers = []
     tot_length_str = str(tot_lines)
     print("Running through alignment file: %s" % input_args.aln)
     print()
-    with open(input_args.aln, "r") as aln_file:
+    with open(aln_filename, "r") as aln_file:
         for index,line in enumerate(aln_file):
             num_zeros = len(tot_length_str) - len(str(index + 1))
             fmt_index = (("0" * num_zeros) + str(index + 1))
@@ -78,6 +100,7 @@ def main(input_args):
     ## Second pass to line up all the counts across the isolates 
     print("")
     print("Assessing counts...")
+    total_headers.sort()
     isolate_bases = []
     for count in total_base_counts:
         iso_row = []
@@ -88,7 +111,7 @@ def main(input_args):
     total_headers.insert(0, "isolate")
     write_out_headers = [re.sub("-","gap",i) for i in total_headers]
     print("Writing out results...")
-    with open((input_args.out + ".csv"), "w") as output:
+    with open((input_args.out), "w") as output:
         output.write(",".join(write_out_headers) + "\n")
         for i, aln_row in enumerate(isolate_bases):
             aln_row_str = list(map(str, aln_row))