Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backend/fpseq/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .align import ParasailAlignment, align_seqs
from .align import SequenceAlignment, align_seqs
from .fpseq import FPSeq, from_fpbase
from .mutations import Mutation, MutationSet, get_mutations, mutate_sequence
from .skbio_protein import SkbSequence
Expand All @@ -8,7 +8,7 @@
"FPSeq",
"Mutation",
"MutationSet",
"ParasailAlignment",
"SequenceAlignment",
"SkbSequence",
"align_seqs",
"from_fpbase",
Expand Down
265 changes: 230 additions & 35 deletions backend/fpseq/align.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,78 @@
try:
from parasail import blosum62, nw_banded, nw_trace_scan_sat
except ImportError:
import warnings
"""Sequence alignment utilities.

This module provides functions for aligning protein sequences using
the Needleman-Wunsch global alignment algorithm with BLOSUM62 scoring matrix.
"""

from __future__ import annotations

from Bio import Align
from Bio.Align import substitution_matrices

warnings.warn("ERROR!!! could not import parasail... will not be able to align", stacklevel=2)
from .util import chunked_lines


def align_seqs(query, target, gop=5, gep=1, band_size=0):
"""basic parasail global alignment of two sequences
result is wrapped in ParasailAlignment Class"""
"""Perform global alignment of two sequences.

Uses the Needleman-Wunsch algorithm with BLOSUM62 scoring matrix
and affine gap penalties.

Parameters
----------
query : str
First sequence to align
target : str
Second sequence to align
gop : int, optional
Gap open penalty (default: 5)
gep : int, optional
Gap extension penalty (default: 1)
band_size : int, optional
Not currently used (kept for backwards compatibility)

Returns
-------
SequenceAlignment
Alignment result wrapped in SequenceAlignment class
"""
query = str(query)
target = str(target)
if band_size:
result = nw_banded(target, query, gop, gep, band_size, blosum62)
else:
result = nw_trace_scan_sat(target, query, gop, gep, blosum62)
return ParasailAlignment(result)

# Create aligner with BLOSUM62 and gap penalties
aligner = Align.PairwiseAligner()
aligner.mode = "global"
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
# BioPython uses negative scores for penalties
aligner.open_gap_score = -gop
aligner.extend_gap_score = -gep

# Perform alignment
alignments = aligner.align(target, query)
# Get the best alignment
alignment = alignments[0]

return SequenceAlignment(alignment, query, target)


def parental_numbering(aseq1, aseq2):
"""given two ALIGNED sequences, return a 'position list' for the second
sequence based on the parental sequence"""
"""Generate position numbering for aligned sequences.

Given two ALIGNED sequences, return a 'position list' for the second
sequence based on the parental (first) sequence.

Parameters
----------
aseq1 : str
Aligned parental sequence
aseq2 : str
Aligned child sequence

Returns
-------
list[str]
Position labels for each position in aseq2
"""
idx = 1
numlist = []
insertchars = "abcdefghijklmnopqrstuvwxyz"
Expand All @@ -40,21 +91,95 @@ def parental_numbering(aseq1, aseq2):
return numlist


class ParasailAlignment:
"""Convenience class to wrap the results of a parasail alignment"""
class SequenceAlignment:
"""Wrapper class for sequence alignment results.

This class provides a consistent interface for alignment results,
compatible with the previous parasail-based implementation.
"""

def __init__(self, result):
self.cigar = result.cigar.decode
if isinstance(self.cigar, bytes):
self.cigar = self.cigar.decode()
# confusing nomenclature is for consistency with scikit-bio
# where "query" is the initial sequence
self.target = result.query
self.query = result.ref
self.score = result.score
def __init__(self, alignment, query, target):
"""Initialize from BioPython Alignment object.

Parameters
----------
alignment : Bio.Align.Alignment
The BioPython alignment result
query : str
Original query sequence
target : str
Original target sequence
"""
self._alignment = alignment
# Store original sequences for compatibility
self.query = query
self.target = target
self.score = alignment.score
self._cigar = None
self._mutations = None

@property
def cigar(self):
"""Generate CIGAR string from alignment coordinates.

Returns
-------
str
CIGAR string representation of the alignment
"""
if self._cigar is not None:
return self._cigar

cigar_parts = []
coords = self._alignment.coordinates

# BioPython coordinates are [seq1_start, seq1_end] for each aligned region
# coords[0] is target, coords[1] is query
target_coords = coords[0]
query_coords = coords[1]

for i in range(len(target_coords) - 1):
target_len = target_coords[i + 1] - target_coords[i]
query_len = query_coords[i + 1] - query_coords[i]

if target_len == query_len:
# Match/mismatch region
if target_len > 0:
# Use '=' for match region
# (Could be split into = for match and X for mismatch in future)
cigar_parts.append(f"{target_len}=")
elif target_len > query_len:
# Deletion from query (or insertion in target)
cigar_parts.append(f"{target_len}D")
else:
# Insertion in query (or deletion from target)
cigar_parts.append(f"{query_len}I")

self._cigar = "".join(cigar_parts)
return self._cigar

@property
def cigar_tuple(self):
"""Get CIGAR as list of (length, operation) tuples.

Returns
-------
list[tuple[int, str]]
List of (length, operation) tuples
"""
if hasattr(self, "_cigar_tuple"):
return self._cigar_tuple
self._cigar_tuple = self._tuples_from_cigar()
return self._cigar_tuple

def _tuples_from_cigar(self):
"""Convert CIGAR string to list of tuples.

Returns
-------
list[tuple[int, str]]
List of (length, operation) tuples
"""
tuples = []
length_stack = []
for character in self.cigar:
Expand All @@ -65,17 +190,12 @@ def _tuples_from_cigar(self):
length_stack = []
return tuples

@property
def cigar_tuple(self):
if hasattr(self, "_cigar_tuple"):
return self._cigar_tuple
self._cigar_tuple = self._tuples_from_cigar()
return self._cigar_tuple

def __repr__(self):
"""String representation."""
return str(self)

def __str__(self):
"""Pretty-print alignment with match indicators."""
a = chunked_lines(self.aligned_target_sequence(), spacer="")
b = chunked_lines(self.aligned_query_sequence(), spacer="")
out = []
Expand All @@ -86,10 +206,23 @@ def __str__(self):
return "\n".join(out)

def __iter__(self):
"""Iterate over aligned sequences."""
yield self.aligned_query_sequence()
yield self.aligned_target_sequence()

def as_mutations(self, reference=None):
"""Convert alignment to mutation set.

Parameters
----------
reference : str, optional
Reference sequence for numbering

Returns
-------
MutationSet
Set of mutations between sequences
"""
from .mutations import MutationSet, _get_aligned_muts

seq1, seq2 = self
Expand All @@ -99,16 +232,58 @@ def as_mutations(self, reference=None):
return MutationSet(mutstring)

def print_alignment(self, max_length=80):
"""Print alignment to stdout.

Parameters
----------
max_length : int, optional
Maximum line length (not currently used)
"""
print(self.aligned_query_sequence() + "\n" + self.aligned_target_sequence())

def aligned_query_sequence(self):
return self._get_aligned_sequence(self.query, "I")
"""Get aligned query sequence with gaps.

Returns
-------
str
Query sequence with gaps inserted
"""
# BioPython alignment format returns aligned sequences
# alignment[0] is target (first arg to align()), alignment[1] is query
return str(self._alignment[1])

def aligned_target_sequence(self):
return self._get_aligned_sequence(self.target, "D")
"""Get aligned target sequence with gaps.

Returns
-------
str
Target sequence with gaps inserted
"""
# BioPython alignment format returns aligned sequences
# alignment[0] is target (first arg to align()), alignment[1] is query
return str(self._alignment[0])

def _get_aligned_sequence(self, seq, gap_type, gap_char="-", eq_char="="):
# assume zero based
"""Build aligned sequence from CIGAR string.

Parameters
----------
seq : str
Original sequence
gap_type : str
'D' for query sequence, 'I' for target sequence
gap_char : str, optional
Character to use for gaps (default: '-')
eq_char : str, optional
Character representing matches in CIGAR (default: '=')

Returns
-------
str
Aligned sequence with gaps
"""
# gap_type is 'D' when returning aligned query sequence
# gap_type is 'I' when returning aligned target sequence
aligned_sequence = ""
Expand All @@ -128,4 +303,24 @@ def _get_aligned_sequence(self, seq, gap_type, gap_char="-", eq_char="="):

@classmethod
def from_seqs(cls, query, target, **kwargs):
"""Create alignment from two sequences.

Parameters
----------
query : str
First sequence
target : str
Second sequence
**kwargs
Additional arguments passed to align_seqs

Returns
-------
SequenceAlignment
Alignment result
"""
return align_seqs(query, target, **kwargs)


# Backwards compatibility alias
ParasailAlignment = SequenceAlignment
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ dependencies = [
'maxminddb>=2.5.1',
'numpy>=1.26.3',
'pandas>=2.0.3',
'parasail==1.3.4',
'Pillow>=10.0.0',
'psycopg2>=2.9.11; platform_system == "Linux"',
'python-slugify>=8.0.1',
Expand Down
18 changes: 0 additions & 18 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.