Skip to content

Commit

Permalink
Merge pull request #367 from nickjcroucher/alignment_updating
Browse files Browse the repository at this point in the history
Update alignment generation methods
  • Loading branch information
nickjcroucher authored May 5, 2023
2 parents f7ef47e + c2d0ac0 commit 2c3c7c2
Show file tree
Hide file tree
Showing 9 changed files with 108 additions and 87 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.3
3.3.0
6 changes: 3 additions & 3 deletions docs/gubbins_manual.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ These will automatically be installed within the conda environment. Please cite

The required input file for Gubbins is a whole genome FASTA alignment. Each sequence should have a unique identifier, and special characters should be avoided. The sequences should only use the characters `ACGT` (DNA bases), `N` (unknown base) or `-` (alignment gap). If a starting tree is to be included, then this should be a Newick format.

The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA](https://github.com/simonrharris/SKA), which can be installed through `conda install -c bioconda ska` (SKA is included when installing Gubbins through conda). For instance,
The alignment is most easily generated through mapping sequences against a reference sequence. This can be achieved with the popular mapping software Snippy, following the instructions on the relevant [Github repository](https://github.com/tseemann/snippy). Alternatively, the alignment can be generated using the Gubbins script `generate_ska_alignment.py`, which creates an alignment using [SKA2](https://github.com/bacpop/ska.rust), which can be installed through `conda install -c bioconda ska2` (SKA2 is included when installing Gubbins through conda). For instance,

```
generate_ska_alignment.py --reference seq_X.fa --fasta fasta_files.list --fastq fastq_files.list --out out.aln
generate_ska_alignment.py --reference seq_X.fa --input input.list --out out.aln
```

Where `fasta_files.list` is a two column tab-delimited file containing sequence names (in the first column) and FASTA sequence assembly file paths (in the second column); `fastq_files.list` contains the same for unassembled FASTQ-format read data.
Where `input.list` is a tab-delimited file with one row per isolate. The first column should be the isolate name, and the subsequent entries on the same row should contain the corresponding sequence data - this may be a single FASTA assembly, or multiple FASTQ raw read files. The alignment will be reformatted for Gubbins (e.g. modifying isolate names and removing non-N IUPAC ambiguity codes) by this script.

The alignment can then be analysed with Gubbins:

Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ dependencies:
- raxml-ng=1.0.1
- fasttree=2.1.10
# Scripts
- ska
- ska2
8 changes: 8 additions & 0 deletions python/gubbins/tests/data/corrected_alignment.aln
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>sequence1
AAANNNNAAA
>sequence2
CCCCCCCCCC
>sequence3
GGGGGGGGGG
>sequence4
TTTTTTTTTT
5 changes: 5 additions & 0 deletions python/gubbins/tests/data/corrected_alignment.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
isolate,A,C,G,N,T
sequence1,6,0,0,4,0
sequence2,0,10,0,0,0
sequence3,0,0,10,0,0
sequence4,0,0,0,0,10
8 changes: 8 additions & 0 deletions python/gubbins/tests/data/invalid_alignment.aln
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>sequence1
AAARRRRAAA
>sequence2
CCCCCCCCCC
>sequence3
GGGGGGGGGG
>sequence4
TTTTTTTTTT
23 changes: 17 additions & 6 deletions python/gubbins/tests/test_python_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,28 @@

class TestPythonScripts(unittest.TestCase):

## Test the alignment_checker script

## Test the alignment_checker script
def test_alignment_checker(self):
small_aln = os.path.join(data_dir, "valid_alignment.aln")
output_file = os.path.join(working_dir, "valid_alignment_test")
output_csv = os.path.join(working_dir, "valid_alignment_test.csv")
test_csv = os.path.join(data_dir, "test_valid_output.csv")
aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_file
aln_cmd = "gubbins_alignment_checker.py --aln " + small_aln + " --out " + output_csv
subprocess.check_call(aln_cmd, shell=True)
assert self.md5_check(output_csv, test_csv)
os.remove(output_csv)

def test_alignment_reformatting(self):
invalid_aln = os.path.join(data_dir, "invalid_alignment.aln")
output_file = os.path.join(working_dir, "invalid_alignment_test.aln")
output_csv = os.path.join(working_dir, "valid_alignment_test.csv")
test_aln = os.path.join(data_dir, "corrected_alignment.aln")
test_csv = os.path.join(data_dir, "corrected_alignment.csv")
aln_cmd = "gubbins_alignment_checker.py --aln " + invalid_aln + " --out-aln " + output_file + " --out " + output_csv
subprocess.check_call(aln_cmd, shell=True)
assert self.md5_check(output_csv, test_csv)
assert self.md5_check(output_file, test_aln)
os.remove(output_csv)
os.remove(output_file)

## Test the clade extraction script
def test_clade_extraction(self):
Expand Down Expand Up @@ -82,9 +93,9 @@ def test_generate_ska_alignment(self):
ref_seq = os.path.join(preprocess_dir, 'sequence_t1.fasta')
aln_out = os.path.join(preprocess_dir, 'ska_test_aln.aln')
# Script name
ska_cmd = "generate_ska_alignment.py --fasta " + fasta_loc +\
ska_cmd = "generate_ska_alignment.py --input " + fasta_loc +\
" --reference " + ref_seq + " --out " + aln_out +\
" --k 6"
" --k 7"
subprocess.check_call(ska_cmd, shell=True)
## Now run gubbins on the aln and check all the output is produced
parser = run_gubbins.parse_input_args()
Expand Down
96 changes: 31 additions & 65 deletions python/scripts/generate_ska_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,16 @@
# command line parsing
def get_options():

parser = argparse.ArgumentParser(description='Generate a ska alignment from a list '
parser = argparse.ArgumentParser(description='Generate a ska2 alignment from a list '
'of assemblies',
prog='generate_ska_alignment')

# input options
parser.add_argument('--reference',
help = 'Name of reference sequence to use for alignment',
required = True)
parser.add_argument('--fasta',
help = 'Two column list of names and FASTA files to include in alignment',
default = None,
required = False)
parser.add_argument('--fastq',
help = 'Two/three column list of names and of FASTQ files to include in alignment',
parser.add_argument('--input',
help = 'List of sequence data; one row per isolate, with first column being the isolate name',
default = None,
required = False)
parser.add_argument('--out',
Expand All @@ -53,7 +49,7 @@ def get_options():
parser.add_argument('--k',
help = 'Split kmer size',
type = int,
default = 15)
default = 17)
parser.add_argument('--threads',
help = 'Number of threads to use',
type = int,
Expand Down Expand Up @@ -89,67 +85,37 @@ def ska_map_sequences(seq, k = None, ref = None):

# Check if ska is installed
if which('ska') is None:
sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"')
sys.stderr.write('ska2 cannot be found on PATH; install with "conda install ska2"')
sys.exit(1)

# Dictionary for sequence names
seq_names = {}
all_names = []

# Make split kmers from assemblies
fasta_names = []
if args.fasta is not None:
# Read in FASTA assemblies
with open(args.fasta,'r') as fasta_list:
for line in fasta_list.readlines():
info = line.strip().split()
if os.path.isfile(info[1]):
fasta_names.append(info[1])
seq_names[info[1]] = info[0]
all_names.append(info[0])
else:
sys.stderr.write('Unable to find file ' + info[1] + '\n')
# Sketch into split kmers
with Pool(processes = args.threads) as pool:
pool.map(partial(map_fasta_sequence,
k = args.k,
names = seq_names),
fasta_names)

# Make split kmers from FASTQs
fastq_names = []
if args.fastq is not None:
# Read in FASTQ reads
with open(args.fastq,'r') as fastq_list:
for line in fastq_list.readlines():
info = line.strip().split()
if os.path.isfile(info[1]) and os.path.isfile(info[2]):
fastq_names.append((info[1],info[2]))
seq_names[info[1]] = info[0]
all_names.append(info[0])
else:
sys.stderr.write('Unable to find files ' + info[1] + ' and ' + info[2] + '\n')
# Sketch into split kmers
with Pool(processes = args.threads) as pool:
return_codes = pool.map(partial(map_fastq_sequence,
k = args.k,
names = seq_names),
fastq_names)

# Map sequences
with Pool(processes = args.threads) as pool:
return_codes = pool.map(partial(ska_map_sequences,
k = args.k,
ref = args.reference),
all_names)

# Generate alignment
subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out,
# Check if k value is acceptable:
if (args.k % 2) == 0 or args.k < 5 or args.k > 63:
sys.stderr.write('k must be odd and between 5 and 63\n')
sys.exit(1)

# Build ska sketch
subprocess.check_output('ska build -o ' + args.out + ' -k ' + str(args.k) + \
' -f ' + args.input + ' --threads ' + str(args.threads),
shell = True)

# Run ska mapping
if os.path.exists(args.out + '.skf'):
tmp_aln = os.path.join(os.path.dirname(args.out), 'tmp.' + os.path.basename(args.out))
subprocess.check_output('ska map -o ' + tmp_aln + ' --threads ' + str(args.threads) + ' ' + \
args.reference + ' ' + args.out + '.skf',
shell = True)
else:
sys.stderr.write('ska building failed\n')
sys.exit(1)

# Clean alignment to prep for Gubbins
subprocess.check_output('gubbins_alignment_checker.py --aln ' + tmp_aln + ' --out-aln ' + args.out + \
' --out ' + args.out + '.csv',
shell = True)

sys.stderr.write("Completed generating alignment with ska2 (https://github.com/bacpop/ska.rust)\n")

# Clean up
if not args.no_cleanup:
subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]),
shell = True)
subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]),
subprocess.check_output('rm ' + args.out + '.skf ' + tmp_aln,
shell = True)
45 changes: 34 additions & 11 deletions python/scripts/gubbins_alignment_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,35 @@
#

import argparse
from asyncore import write
import re
from collections import Counter

def parse_input_args():

parser = argparse.ArgumentParser(
description='Script to output bases in an alignment by isolate',
description='Script to evaluate and reformat an alignment prior to Gubbins analysis',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--aln', '-a', dest="aln",
help='Multifasta alignment file', required=True)
parser.add_argument('--out', '-o',
parser.add_argument('--aln',
'-a',
dest="aln",
help='Multifasta alignment filename',
required=True)
parser.add_argument('--out-aln',
dest="out_aln",
help='Reformatted alignment filename',
default = None,
required=False)
parser.add_argument('--out',
'-o',
dest="out",
help="Out csv name for writing results too", required=True)
help='Output CSV filename',
required=True)

return parser.parse_args()

def main(input_args):

## Lets set up the csv
# Read the number of rows in the alignment
row_num = 0
tot_lines = 0
with open(input_args.aln, "r") as aln_file:
Expand All @@ -48,16 +57,29 @@ def main(input_args):
row_num += 1
tot_lines += 1

# Going to use counter in a first pass to store sequence counts and the range of sequences

# Filter alignment if requested
aln_filename = input_args.aln
if input_args.out_aln:
with open(input_args.aln, "r") as aln_file, \
open(input_args.out_aln, "w") as out_aln_file:
for line in aln_file:
if re.search("^>", line):
name = line.replace("#","_").replace(":","_").replace(">","").rstrip().split()
out_aln_file.write('>' + name[0] + '\n')
else:
sequence = line.upper().rstrip()
sequence = re.sub('[^ACGTN-]','N',sequence)
out_aln_file.write(sequence + '\n')
aln_filename = input_args.out_aln

# Going to use counter in a first pass to store sequence counts and the range of sequences
total_base_counts = []
iso_data = []
total_headers = []
tot_length_str = str(tot_lines)
print("Running through alignment file: %s" % input_args.aln)
print()
with open(input_args.aln, "r") as aln_file:
with open(aln_filename, "r") as aln_file:
for index,line in enumerate(aln_file):
num_zeros = len(tot_length_str) - len(str(index + 1))
fmt_index = (("0" * num_zeros) + str(index + 1))
Expand All @@ -78,6 +100,7 @@ def main(input_args):
## Second pass to line up all the counts across the isolates
print("")
print("Assessing counts...")
total_headers.sort()
isolate_bases = []
for count in total_base_counts:
iso_row = []
Expand All @@ -88,7 +111,7 @@ def main(input_args):
total_headers.insert(0, "isolate")
write_out_headers = [re.sub("-","gap",i) for i in total_headers]
print("Writing out results...")
with open((input_args.out + ".csv"), "w") as output:
with open((input_args.out), "w") as output:
output.write(",".join(write_out_headers) + "\n")
for i, aln_row in enumerate(isolate_bases):
aln_row_str = list(map(str, aln_row))
Expand Down

0 comments on commit 2c3c7c2

Please sign in to comment.