From 5f01b031aa04cd5f18ece95cbb3af8526febc542 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 4 May 2023 17:08:00 +0100
Subject: [PATCH 01/13] Change alignment checker output file names

---
 python/gubbins/tests/test_python_scripts.py | 2 +-
 python/scripts/gubbins_alignment_checker.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py
index ecddc9f6..f129623d 100644
--- a/python/gubbins/tests/test_python_scripts.py
+++ b/python/gubbins/tests/test_python_scripts.py
@@ -23,7 +23,7 @@ class TestPythonScripts(unittest.TestCase):
 
     def test_alignment_checker(self):
         small_aln = os.path.join(data_dir, "valid_alignment.aln")
-        output_file = os.path.join(working_dir, "valid_alignment_test")
+        output_file = os.path.join(working_dir, "valid_alignment_test.csv")
         output_csv = os.path.join(working_dir, "valid_alignment_test.csv")
         test_csv = os.path.join(data_dir, "test_valid_output.csv")
         aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_file
diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py
index d0d65807..28851614 100755
--- a/python/scripts/gubbins_alignment_checker.py
+++ b/python/scripts/gubbins_alignment_checker.py
@@ -31,6 +31,8 @@ def parse_input_args():
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--aln', '-a', dest="aln",
                         help='Multifasta alignment file', required=True)
+    parser.add_argument('--out-aln', '-a', dest="aln",
+                        help='Reformatted alignment file', required=False)
     parser.add_argument('--out', '-o',
                         dest="out",
                         help="Out csv name for writing results too", required=True)
@@ -49,8 +51,6 @@ def main(input_args):
             tot_lines += 1
 
     # Going to use counter in a first pass to store sequence counts and the range of sequences
-    
-
     total_base_counts = []
     iso_data = []
     total_headers = []
@@ -88,7 +88,7 @@ def main(input_args):
     total_headers.insert(0, "isolate")
     write_out_headers = [re.sub("-","gap",i) for i in total_headers]
     print("Writing out results...")
-    with open((input_args.out + ".csv"), "w") as output:
+    with open((input_args.out), "w") as output:
         output.write(",".join(write_out_headers) + "\n")
         for i, aln_row in enumerate(isolate_bases):
             aln_row_str = list(map(str, aln_row))

From 425fc4383632eb1c65c7c90c1ef5645ca0275b37 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 4 May 2023 17:16:13 +0100
Subject: [PATCH 02/13] Update command line parsing

---
 python/scripts/gubbins_alignment_checker.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py
index 28851614..0e1b135c 100755
--- a/python/scripts/gubbins_alignment_checker.py
+++ b/python/scripts/gubbins_alignment_checker.py
@@ -27,15 +27,22 @@
 def parse_input_args():
 
     parser = argparse.ArgumentParser(
-        description='Script to output bases in an alignment by isolate',
+        description='Script to evaluate and reformat an alignment prior to Gubbins analysis',
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--aln', '-a', dest="aln",
-                        help='Multifasta alignment file', required=True)
-    parser.add_argument('--out-aln', '-a', dest="aln",
-                        help='Reformatted alignment file', required=False)
-    parser.add_argument('--out', '-o',
+    parser.add_argument('--aln',
+                        '-a',
+                        dest="aln",
+                        help='Multifasta alignment filename',
+                        required=True)
+    parser.add_argument('--out-aln',
+                        dest="out_aln",
+                        help='Reformatted alignment filename',
+                        required=False)
+    parser.add_argument('--out',
+                        '-o',
                         dest="out",
-                        help="Out csv name for writing results too", required=True)
+                        help='Output CSV filename',
+                        required=True)
 
     return parser.parse_args()
 

From 52ad7dd7a339770628b4201112d8bb1db87ee867 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 4 May 2023 17:47:03 +0100
Subject: [PATCH 03/13] Enable reformatting of alignments

---
 python/scripts/gubbins_alignment_checker.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py
index 0e1b135c..76910d42 100755
--- a/python/scripts/gubbins_alignment_checker.py
+++ b/python/scripts/gubbins_alignment_checker.py
@@ -37,6 +37,7 @@ def parse_input_args():
     parser.add_argument('--out-aln',
                         dest="out_aln",
                         help='Reformatted alignment filename',
+                        default = None,
                         required=False)
     parser.add_argument('--out',
                         '-o',
@@ -48,7 +49,7 @@ def parse_input_args():
 
 def main(input_args):
 
-    ## Lets set up the csv
+    # Read the number of rows in the alignment
     row_num = 0
     tot_lines = 0
     with open(input_args.aln, "r") as aln_file:
@@ -57,6 +58,21 @@ def main(input_args):
                 row_num += 1
             tot_lines += 1
 
+    # Filter alignment if requested
+    aln_filename = input_args.aln
+    if input_args.out_aln:
+        with open(input_args.aln, "r") as aln_file, \
+                open(input_args.out_aln, "w") as out_aln_file:
+            for line in aln_file:
+                if re.search("^>", line):
+                    name = line.replace("#","_").replace(":","_").replace(">","").rstrip().split()
+                    out_aln_file.write('>' + name[0] + '\n')
+                else:
+                    sequence = line.upper().rstrip()
+                    sequence = re.sub('[^ACGTN-]','N',sequence)
+                    out_aln_file.write(sequence + '\n')
+        aln_filename = input_args.out_aln
+
     # Going to use counter in a first pass to store sequence counts and the range of sequences
     total_base_counts = []
     iso_data = []
@@ -64,7 +80,7 @@ def main(input_args):
     tot_length_str = str(tot_lines)
     print("Running through alignment file: %s" % input_args.aln)
     print()
-    with open(input_args.aln, "r") as aln_file:
+    with open(aln_filename, "r") as aln_file:
         for index,line in enumerate(aln_file):
             num_zeros = len(tot_length_str) - len(str(index + 1))
             fmt_index = (("0" * num_zeros) + str(index + 1))

From 6999fbf768d7f81aca68b91c258ccd46f99d565f Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 4 May 2023 18:02:16 +0100
Subject: [PATCH 04/13] Add test for alignment reformatting

---
 .../gubbins/tests/data/corrected_alignment.aln   |  8 ++++++++
 .../gubbins/tests/data/corrected_alignment.csv   |  5 +++++
 python/gubbins/tests/data/invalid_alignment.aln  |  8 ++++++++
 python/gubbins/tests/test_python_scripts.py      | 16 ++++++++++++++--
 4 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 python/gubbins/tests/data/corrected_alignment.aln
 create mode 100644 python/gubbins/tests/data/corrected_alignment.csv
 create mode 100644 python/gubbins/tests/data/invalid_alignment.aln

diff --git a/python/gubbins/tests/data/corrected_alignment.aln b/python/gubbins/tests/data/corrected_alignment.aln
new file mode 100644
index 00000000..fafdb3be
--- /dev/null
+++ b/python/gubbins/tests/data/corrected_alignment.aln
@@ -0,0 +1,8 @@
+>sequence1
+AAANNNNAAA
+>sequence2
+CCCCCCCCCC
+>sequence3
+GGGGGGGGGG
+>sequence4
+TTTTTTTTTT
diff --git a/python/gubbins/tests/data/corrected_alignment.csv b/python/gubbins/tests/data/corrected_alignment.csv
new file mode 100644
index 00000000..91e775f1
--- /dev/null
+++ b/python/gubbins/tests/data/corrected_alignment.csv
@@ -0,0 +1,5 @@
+isolate,A,N,C,G,T
+sequence1,6,4,0,0,0
+sequence2,0,0,10,0,0
+sequence3,0,0,0,10,0
+sequence4,0,0,0,0,10
diff --git a/python/gubbins/tests/data/invalid_alignment.aln b/python/gubbins/tests/data/invalid_alignment.aln
new file mode 100644
index 00000000..022422cf
--- /dev/null
+++ b/python/gubbins/tests/data/invalid_alignment.aln
@@ -0,0 +1,8 @@
+>sequence1
+AAARRRRAAA
+>sequence2
+CCCCCCCCCC
+>sequence3
+GGGGGGGGGG
+>sequence4
+TTTTTTTTTT
diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py
index f129623d..889b6d44 100644
--- a/python/gubbins/tests/test_python_scripts.py
+++ b/python/gubbins/tests/test_python_scripts.py
@@ -19,8 +19,7 @@
 
 class TestPythonScripts(unittest.TestCase):
 
-    ## Test the alignment_checker script 
-
+    ## Test the alignment_checker script
     def test_alignment_checker(self):
         small_aln = os.path.join(data_dir, "valid_alignment.aln")
         output_file = os.path.join(working_dir, "valid_alignment_test.csv")
@@ -31,6 +30,19 @@ def test_alignment_checker(self):
         assert self.md5_check(output_csv, test_csv)
         os.remove(output_csv)
 
+    def test_alignment_reformatting(self):
+        invalid_aln = os.path.join(data_dir, "invalid_alignment.aln")
+        output_file = os.path.join(working_dir, "invalid_alignment_test.aln")
+        output_csv = os.path.join(working_dir, "valid_alignment_test.csv")
+        test_aln = os.path.join(data_dir, "corrected_alignment.aln")
+        test_csv = os.path.join(data_dir, "corrected_alignment.csv")
+        aln_cmd = "gubbins_alignment_checker.py --aln " + invalid_aln + " --out-aln " + output_file + " --out " + output_csv
+        subprocess.check_call(aln_cmd, shell=True)
+        assert self.md5_check(output_csv, test_csv)
+        assert self.md5_check(output_file, test_aln)
+        os.remove(output_csv)
+        os.remove(output_file)
+
     ## Test the clade extraction script
     def test_clade_extraction(self):
         multiple_aln = os.path.join(data_dir, "multiple_recombinations.aln")

From 31c80fe8b4c30ff221300a14d804e34ed2625c4a Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Thu, 4 May 2023 21:45:37 +0100
Subject: [PATCH 05/13] Upgrade to ska2

---
 python/scripts/generate_ska_alignment.py | 75 ++++--------------------
 1 file changed, 13 insertions(+), 62 deletions(-)

diff --git a/python/scripts/generate_ska_alignment.py b/python/scripts/generate_ska_alignment.py
index 73a6ab36..97182283 100755
--- a/python/scripts/generate_ska_alignment.py
+++ b/python/scripts/generate_ska_alignment.py
@@ -31,7 +31,7 @@
 # command line parsing
 def get_options():
 
-    parser = argparse.ArgumentParser(description='Generate a ska alignment from a list '
+    parser = argparse.ArgumentParser(description='Generate a ska2 alignment from a list '
                                                     'of assemblies',
                                      prog='generate_ska_alignment')
 
@@ -39,12 +39,8 @@ def get_options():
     parser.add_argument('--reference',
                         help = 'Name of reference sequence to use for alignment',
                         required = True)
-    parser.add_argument('--fasta',
-                        help = 'Two column list of names and FASTA files to include in alignment',
-                        default = None,
-                        required = False)
-    parser.add_argument('--fastq',
-                        help = 'Two/three column list of names and of FASTQ files to include in alignment',
+    parser.add_argument('--input',
+                        help = 'List of sequence data; one row per isolate, with first column being the isolate's name',
                         default = None,
                         required = False)
     parser.add_argument('--out',
@@ -53,7 +49,7 @@ def get_options():
     parser.add_argument('--k',
                         help = 'Split kmer size',
                         type = int,
-                        default = 15)
+                        default = 17)
     parser.add_argument('--threads',
                         help = 'Number of threads to use',
                         type = int,
@@ -89,62 +85,17 @@ def ska_map_sequences(seq, k = None, ref = None):
 
     # Check if ska is installed
     if which('ska') is None:
-        sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"')
+        sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2")
         sys.exit(1)
 
-    # Dictionary for sequence names
-    seq_names = {}
-    all_names = []
-
-    # Make split kmers from assemblies
-    fasta_names = []
-    if args.fasta is not None:
-        # Read in FASTA assemblies
-        with open(args.fasta,'r') as fasta_list:
-            for line in fasta_list.readlines():
-                info = line.strip().split()
-                if os.path.isfile(info[1]):
-                    fasta_names.append(info[1])
-                    seq_names[info[1]] = info[0]
-                    all_names.append(info[0])
-                else:
-                    sys.stderr.write('Unable to find file ' + info[1] + '\n')
-        # Sketch into split kmers
-        with Pool(processes = args.threads) as pool:
-            pool.map(partial(map_fasta_sequence,
-                                k = args.k,
-                                names = seq_names),
-                                fasta_names)
-    
-    # Make split kmers from FASTQs
-    fastq_names = []
-    if args.fastq is not None:
-        # Read in FASTQ reads
-        with open(args.fastq,'r') as fastq_list:
-            for line in fastq_list.readlines():
-                info = line.strip().split()
-                if os.path.isfile(info[1]) and os.path.isfile(info[2]):
-                    fastq_names.append((info[1],info[2]))
-                    seq_names[info[1]] = info[0]
-                    all_names.append(info[0])
-                else:
-                    sys.stderr.write('Unable to find files ' + info[1] + ' and ' + info[2] + '\n')
-        # Sketch into split kmers
-        with Pool(processes = args.threads) as pool:
-            return_codes = pool.map(partial(map_fastq_sequence,
-                                            k = args.k,
-                                            names = seq_names),
-                                            fastq_names)
-
-    # Map sequences
-    with Pool(processes = args.threads) as pool:
-        return_codes = pool.map(partial(ska_map_sequences,
-                                        k = args.k,
-                                        ref = args.reference),
-                                        all_names)
-
-    # Generate alignment
-    subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out,
+    # Build ska sketch
+    subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(k) + \
+                            ' -f ' + args.input + ' --threads ' + str(args.threads),
+                            shell = True)
+
+    # Run ska mapping
+    subprocess.check_output('ska map -o tmp.' + args.out + ' -r ' + ref + \
+                             ' --threads ' + str(args.threads) + ' ' + args.out + '.skf',
                             shell = True)
 
     # Clean up

From 1069d03bd52bbacd6f538340d2769eefdccfdce7 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 5 May 2023 05:23:47 +0100
Subject: [PATCH 06/13] Update ska alignment testing script

---
 python/gubbins/tests/test_python_scripts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py
index 889b6d44..8baead2d 100644
--- a/python/gubbins/tests/test_python_scripts.py
+++ b/python/gubbins/tests/test_python_scripts.py
@@ -94,7 +94,7 @@ def test_generate_ska_alignment(self):
         ref_seq = os.path.join(preprocess_dir, 'sequence_t1.fasta')
         aln_out = os.path.join(preprocess_dir, 'ska_test_aln.aln')
         # Script name
-        ska_cmd = "generate_ska_alignment.py --fasta " + fasta_loc +\
+        ska_cmd = "generate_ska_alignment.py --input " + fasta_loc +\
             " --reference " + ref_seq + " --out " + aln_out +\
                 " --k 6"
         subprocess.check_call(ska_cmd, shell=True)

From 12747bdd00e9c697136a4418eff555abba745cf1 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 5 May 2023 06:11:55 +0100
Subject: [PATCH 07/13] Update alignment generation and formatting

---
 python/scripts/generate_ska_alignment.py    | 31 +++++++++++++++------
 python/scripts/gubbins_alignment_checker.py |  1 -
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/python/scripts/generate_ska_alignment.py b/python/scripts/generate_ska_alignment.py
index 97182283..7c5ba35e 100755
--- a/python/scripts/generate_ska_alignment.py
+++ b/python/scripts/generate_ska_alignment.py
@@ -40,7 +40,7 @@ def get_options():
                         help = 'Name of reference sequence to use for alignment',
                         required = True)
     parser.add_argument('--input',
-                        help = 'List of sequence data; one row per isolate, with first column being the isolate's name',
+                        help = 'List of sequence data; one row per isolate, with first column being the isolate name',
                         default = None,
                         required = False)
     parser.add_argument('--out',
@@ -85,22 +85,37 @@ def ska_map_sequences(seq, k = None, ref = None):
 
     # Check if ska is installed
     if which('ska') is None:
-        sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2")
+        sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2"')
+        sys.exit(1)
+
+    # Check if k value is acceptable:
+    if (args.k % 2) == 0 or args.k < 5 or args.k > 63:
+        sys.stderr.write('k must be odd and between 5 and 63\n')
         sys.exit(1)
 
     # Build ska sketch
-    subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(k) + \
+    subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(args.k) + \
                             ' -f ' + args.input + ' --threads ' + str(args.threads),
                             shell = True)
 
     # Run ska mapping
-    subprocess.check_output('ska map -o tmp.' + args.out + ' -r ' + ref + \
-                             ' --threads ' + str(args.threads) + ' ' + args.out + '.skf',
+    if os.path.exists(args.out + '.skf'):
+        tmp_aln = os.path.join(os.path.dirname(args.out), 'tmp.' + os.path.basename(args.out))
+        subprocess.check_output('ska map -o ' + tmp_aln + ' --threads ' + str(args.threads) + ' ' +  \
+                                    args.reference + ' ' + args.out + '.skf',
+                                shell = True)
+    else:
+        sys.stderr.write('ska building failed\n')
+        sys.exit(1)
+                            
+    # Clean alignment to prep for Gubbins
+    subprocess.check_output('gubbins_alignment_checker.py --aln ' + tmp_aln + ' --out-aln '  + args.out + \
+                             ' --out ' + args.out + '.csv',
                             shell = True)
+    
+    sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\n")
 
     # Clean up
     if not args.no_cleanup:
-        subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]),
-                                shell = True)
-        subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]),
+        subprocess.check_output('rm ' + args.out + '.skf ' + tmp_aln,
                                 shell = True)
diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py
index 76910d42..226f3907 100755
--- a/python/scripts/gubbins_alignment_checker.py
+++ b/python/scripts/gubbins_alignment_checker.py
@@ -20,7 +20,6 @@
 #
 
 import argparse
-from asyncore import write
 import re
 from collections import Counter
 

From a489caf9e1a281c9dc9ae088f9240e18db51b536 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 5 May 2023 06:32:59 +0100
Subject: [PATCH 08/13] Change k-mer length

---
 python/gubbins/tests/test_python_scripts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py
index 8baead2d..3c14bbc4 100644
--- a/python/gubbins/tests/test_python_scripts.py
+++ b/python/gubbins/tests/test_python_scripts.py
@@ -96,7 +96,7 @@ def test_generate_ska_alignment(self):
         # Script name
         ska_cmd = "generate_ska_alignment.py --input " + fasta_loc +\
             " --reference " + ref_seq + " --out " + aln_out +\
-                " --k 6"
+                " --k 7"
         subprocess.check_call(ska_cmd, shell=True)
         ## Now run gubbins on the aln and check all the output is produced 
         parser = run_gubbins.parse_input_args()

From d066650c080caff10ceb3aaf9eba3eab34246418 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 5 May 2023 06:37:05 +0100
Subject: [PATCH 09/13] Update versions for alignment generation

---
 VERSION         | 2 +-
 environment.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/VERSION b/VERSION
index eb39e538..15a27998 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.3
+3.3.0
diff --git a/environment.yml b/environment.yml
index 758f8371..370369a1 100644
--- a/environment.yml
+++ b/environment.yml
@@ -34,4 +34,4 @@ dependencies:
   - raxml-ng=1.0.1
   - fasttree=2.1.10
 # Scripts
-  - ska
+  - ska2

From c7adbb70ddc8302a5b91281a91e606e896a09eb0 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 5 May 2023 07:07:10 +0100
Subject: [PATCH 10/13] Rearrange alignment summary output file

---
 python/gubbins/tests/data/corrected_alignment.csv | 8 ++++----
 python/gubbins/tests/test_python_scripts.py       | 5 ++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/python/gubbins/tests/data/corrected_alignment.csv b/python/gubbins/tests/data/corrected_alignment.csv
index 91e775f1..c8c5eb42 100644
--- a/python/gubbins/tests/data/corrected_alignment.csv
+++ b/python/gubbins/tests/data/corrected_alignment.csv
@@ -1,5 +1,5 @@
-isolate,A,N,C,G,T
-sequence1,6,4,0,0,0
-sequence2,0,0,10,0,0
-sequence3,0,0,0,10,0
+isolate,A,C,G,N,T
+sequence1,6,0,0,4,0
+sequence2,0,10,0,0,0
+sequence3,0,0,10,0,0
 sequence4,0,0,0,0,10
diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py
index 3c14bbc4..62e3fe0e 100644
--- a/python/gubbins/tests/test_python_scripts.py
+++ b/python/gubbins/tests/test_python_scripts.py
@@ -22,12 +22,11 @@ class TestPythonScripts(unittest.TestCase):
     ## Test the alignment_checker script
     def test_alignment_checker(self):
         small_aln = os.path.join(data_dir, "valid_alignment.aln")
-        output_file = os.path.join(working_dir, "valid_alignment_test.csv")
         output_csv = os.path.join(working_dir, "valid_alignment_test.csv")
         test_csv = os.path.join(data_dir, "test_valid_output.csv")
-        aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_file
+        aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_csv
         subprocess.check_call(aln_cmd, shell=True)
-        assert self.md5_check(output_csv, test_csv)
+        assert self.md5_check(output_file, test_csv)
         os.remove(output_csv)
 
     def test_alignment_reformatting(self):

From 5434e031f58e89a1d8ce4605cbcb137d74cbe64d Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 5 May 2023 07:16:29 +0100
Subject: [PATCH 11/13] Fix filename variable

---
 python/gubbins/tests/test_python_scripts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/gubbins/tests/test_python_scripts.py b/python/gubbins/tests/test_python_scripts.py
index 62e3fe0e..7a1724e1 100644
--- a/python/gubbins/tests/test_python_scripts.py
+++ b/python/gubbins/tests/test_python_scripts.py
@@ -26,7 +26,7 @@ def test_alignment_checker(self):
         test_csv = os.path.join(data_dir, "test_valid_output.csv")
         aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_csv
         subprocess.check_call(aln_cmd, shell=True)
-        assert self.md5_check(output_file, test_csv)
+        assert self.md5_check(output_csv, test_csv)
         os.remove(output_csv)
 
     def test_alignment_reformatting(self):

From e02866ed85da12435101155e1d794e3751331184 Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 5 May 2023 07:41:47 +0100
Subject: [PATCH 12/13] Sort CSV headers

---
 python/scripts/gubbins_alignment_checker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/scripts/gubbins_alignment_checker.py b/python/scripts/gubbins_alignment_checker.py
index 226f3907..1257a6d8 100755
--- a/python/scripts/gubbins_alignment_checker.py
+++ b/python/scripts/gubbins_alignment_checker.py
@@ -100,6 +100,7 @@ def main(input_args):
     ## Second pass to line up all the counts across the isolates 
     print("")
     print("Assessing counts...")
+    total_headers.sort()
     isolate_bases = []
     for count in total_base_counts:
         iso_row = []

From c2d0ac0da9b4eee86ab099423c148f1541152a1c Mon Sep 17 00:00:00 2001
From: Nick Croucher <n.croucher@imperial.ac.uk>
Date: Fri, 5 May 2023 07:56:28 +0100
Subject: [PATCH 13/13] Update for new alignment generation

---
 docs/gubbins_manual.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/gubbins_manual.md b/docs/gubbins_manual.md
index 31ecd47e..e3fd739a 100644
--- a/docs/gubbins_manual.md
+++ b/docs/gubbins_manual.md
@@ -52,13 +52,13 @@ These will automatically be installed within the conda environment. Please cite
 
 The required input file for Gubbins is a whole genome FASTA alignment. Each sequence should have a unique identifier, and special characters should be avoided. The sequences should only use the characters `ACGT` (DNA bases), `N` (unknown base) or `-` (alignment gap). If a starting tree is to be included, then this should be a Newick format.
 
-The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA](https://github.com/simonrharris/SKA), which can be installed through `conda install -c bioconda ska` (SKA is included when installing Gubbins through conda). For instance,
+The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA2](https://github.com/bacpop/ska.rust), which can be installed through `conda install -c bioconda ska2` (SKA2 is included when installing Gubbins through conda). For instance,
 
 ```
-generate_ska_alignment.py --reference seq_X.fa --fasta fasta_files.list --fastq fastq_files.list --out out.aln
+generate_ska_alignment.py --reference seq_X.fa --input input.list --out out.aln
 ```
 
-Where `fasta_files.list` is a two column tab-delimited file containing sequence names (in the first column) and FASTA sequence assembly file paths (in the second column); `fastq_files.list` contains the same for unassembled FASTQ-format read data.
+Where `input.list` is a tab-delimited file with one row per isolate. The first column should be the isolate name, and the subsequent entries on the same row should contain the corresponding sequence data - this may be a single FASTA assembly, or multiple FASTQ raw read files. The alignment will be reformatted for Gubbins (e.g. modifying isolate names and removing non-N IUPAC ambiguity codes) by this script.
 
 The alignment can then be analysed with Gubbins: