Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,5 @@ dist-ssr
*.ntvs*
*.njsproj
*.sln
*.sw?
*.sw?
backend/src/services/remote/blast_service/align2.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this here? Remove it

286 changes: 128 additions & 158 deletions backend/src/services/remote/blast_service/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
from pathlib import Path
from typing import Optional, List, Dict, Any
from Bio.Blast import NCBIXML
from services.remote.database_service.xml_to_db import xml_to_db

from src.shared.constants import (
from shared.constants import (
PROGRAM_STORAGE_DIR_SHARED_BLAST,
PROGRAM_STORAGE_DIR_SHARED_DATA_FASTA,
)
Comment on lines +18 to 21
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
from shared.constants import (
PROGRAM_STORAGE_DIR_SHARED_BLAST,
PROGRAM_STORAGE_DIR_SHARED_DATA_FASTA,
)
from shared import PROGRAM_STORAGE_DIR_SHARED_BLAST, PROGRAM_STORAGE_DIR_SHARED_DATA_FASTA

Expand Down Expand Up @@ -152,6 +153,8 @@ def blast_cmdline(
logger.error("`blastn` command not found. Is BLAST+ installed and in PATH?")
raise

xml_to_db(output_file)

return str(output_file)


Expand Down Expand Up @@ -202,176 +205,143 @@ def extract_chromosome(subject_id: str) -> str:
return "Unknown"


import os
import sqlite3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imports at the middle of a file? Relocate to the top of the file where imports should be

from typing import List, Dict, Any
Copy link
Member

@mantvydasdeltuva mantvydasdeltuva May 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicate from typing, remove it


# Root directory for BLAST result artifacts (XML, DB, JSON).
# NOTE(review): hard-codes the ~/.kath layout — presumably duplicates
# PROGRAM_STORAGE_DIR_* in shared.constants; prefer importing from there. TODO confirm.
BASE_OUTPUT_DIR = os.path.join(os.path.expanduser("~"), ".kath", "shared", "data", "blast_results")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use constants from constants.py

# Name of the SQLite database holding parsed BLAST HSPs
# (presumably written by xml_to_db — verify against that module).
DB_FILENAME = "xml.db"
# Module-level path read by parse_blast_results below.
db_file = os.path.join(BASE_OUTPUT_DIR, DB_FILENAME)

# -------------------------------------------------------------
# Reads parsed BLAST HSPs from the SQLite database and returns a
# list of variation dictionaries (not a JSON file).
def parse_blast_results(
    result_file: Optional[str] = None,
    min_hsp_score: float = 50.0,
    min_identity_perc: float = 95.0,
    min_hsp_length: int = 50,
    evalue_threshold: float = 1e-10,
) -> List[Dict[str, Any]]:
    """
    Parse BLAST results from SQLite DB (latest file_id), identify variations
    (substitutions, insertions, deletions), and filter based on alignment
    quality metrics.

    Args:
        result_file: Unused. Kept (now optional) for backward compatibility
            with callers that previously passed a BLAST XML path; the data is
            now read from the module-level `db_file` database instead.
        min_hsp_score: Minimum bit score for an HSP to be considered.
        min_identity_perc: Minimum percent identity for an HSP (0-100).
        min_hsp_length: Minimum alignment length for an HSP.
        evalue_threshold: Maximum E-value for an HSP.

    Returns:
        A list of dictionaries, each describing a detected variation.
        Empty list if the database holds no HSP rows.
    """
    variations: List[Dict[str, Any]] = []
    logger.info(f"Parsing BLAST results from database: {db_file}")

    conn = sqlite3.connect(db_file)
    # try/finally guarantees the connection is closed even if a query fails.
    try:
        cursor = conn.cursor()

        # Get the latest file_id (most recently inserted BLAST run).
        cursor.execute("SELECT MAX(file_id) FROM hsps")
        result = cursor.fetchone()
        if not result or result[0] is None:
            logger.warning("No file_id found in database.")
            return []

        latest_file_id = result[0]
        logger.info(f"Using data from file_id: {latest_file_id}")

        # Fetch all HSPs for the latest file_id (parameterized query).
        cursor.execute(
            """
            SELECT
                hit_id, hsp_num, bit_score, score, evalue, query_from, query_to,
                hit_from, hit_to, query_frame, hit_frame, identity, positive, gaps,
                align_len, qseq, hseq, midline
            FROM hsps
            WHERE file_id = ?
            """,
            (latest_file_id,),
        )
        hsps = cursor.fetchall()

        # Map column names onto each row so the loop reads by field name.
        col_names = [desc[0] for desc in cursor.description]

        for hsp_row in hsps:
            hsp = dict(zip(col_names, hsp_row))

            # Percent identity; guard against align_len of 0 or NULL columns.
            try:
                percent_identity = (hsp["identity"] / hsp["align_len"]) * 100
            except (ZeroDivisionError, TypeError):
                percent_identity = 0

            # --- Quality Filtering ---
            if (
                hsp["evalue"] > evalue_threshold
                or hsp["bit_score"] < min_hsp_score
                or percent_identity < min_identity_perc
                or hsp["align_len"] < min_hsp_length
            ):
                continue

            # --- Detailed Variation Extraction ---
            query_seq = hsp["qseq"]
            sbjct_seq = hsp["hseq"]

            query_pos = hsp["query_from"] - 1  # 0-based index
            sbjct_pos = hsp["hit_from"] - 1  # 0-based index

            for i in range(hsp["align_len"]):
                q_base = query_seq[i]
                s_base = sbjct_seq[i]

                # Track per-sequence positions; gaps ('-') do not advance them.
                q_pos_current = -1
                if q_base != "-":
                    query_pos += 1
                    q_pos_current = query_pos

                s_pos_current = -1
                if s_base != "-":
                    sbjct_pos += 1
                    s_pos_current = sbjct_pos

                # Identify variation type
                variation_type = None
                ref_allele = None
                alt_allele = None

                if q_base != s_base:
                    if q_base == "-":
                        variation_type = "insertion"
                        ref_allele = "-"
                        alt_allele = s_base
                    elif s_base == "-":
                        variation_type = "deletion"
                        ref_allele = q_base
                        alt_allele = "-"
                    else:  # Substitution (Mismatch)
                        variation_type = "substitution"
                        ref_allele = s_base
                        alt_allele = q_base

                if variation_type:
                    variation_details = {
                        "query_id": None,  # Not stored in DB, set to None or fetch if available
                        "subject_id": hsp["hit_id"],
                        "chromosome": None,  # Not stored in DB, set to None or fetch if available
                        "position": (
                            s_pos_current + 1 if s_pos_current != -1 else None
                        ),
                        "variation_type": variation_type,
                        "reference_allele": ref_allele,
                        "query_allele": alt_allele,
                        "query_position": (
                            q_pos_current + 1 if q_pos_current != -1 else None
                        ),
                        "hsp_score": hsp["bit_score"],
                        "hsp_evalue": hsp["evalue"],
                        "hsp_identity": percent_identity,
                        "hsp_align_length": hsp["align_len"],
                        "hsp_query_start": hsp["query_from"],
                        "hsp_subject_start": hsp["hit_from"],
                        "hsp_strand": None,  # Not stored in DB, set to None or fetch if available
                        "hsp_gaps": hsp["gaps"],
                    }
                    variations.append(variation_details)
    finally:
        conn.close()

    logger.info(f"Total variations found meeting quality criteria: {len(variations)}")
    return variations


Expand Down Expand Up @@ -422,7 +392,7 @@ def _run_blast_and_parse_single(

# --- Parse Results ---
try:
variations = parse_blast_results(blast_result_xml)
variations = parse_blast_results()

# --- Save Variations to JSON ---
# Always save a JSON file, even if empty, to indicate processing occurred
Expand Down
19 changes: 16 additions & 3 deletions backend/src/services/remote/blast_service/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from .align import process_single_fasta
from .find import process_variants
from services.remote.database_service.json_to_db import json_to_db


class BlastService:
Expand All @@ -30,12 +31,24 @@ def disease_extraction(self, fasta_file: str):
try:
print(f"Processing file: {fasta_file}")
# Perform blast aligning
# +++++++++++++++++++++++++++++++++++++++++++++++++
# Takes in fasta file and returns json file
result_file = process_single_fasta(fasta_file)

if not result_file:
raise Exception("Failed to perform blast aligning")



#++++++++++++++++++++++++++++++++++++++++++++++++
# Takes in Json file and returns db file
db_file = json_to_db(result_file)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is `db_file` used for? If the value isn't used anywhere, don't assign it to a variable.


if not db_file:
raise Exception("Failed to insert data into the database")

raise Exception("No diseases found or failed to process variants")

# +++++++++++++++++++++++++++++++++++++++++++++++++
# Takes in Json file and returns csv file
disease_file = process_variants(result_file)

if not disease_file:
Expand Down
6 changes: 6 additions & 0 deletions backend/src/services/remote/database_service/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Public API of the database_service package.
from .xml_to_db import xml_to_db
from .json_to_db import json_to_db

# json_to_db was imported but missing here, so `from ... import *`
# (and API-surface tooling) silently dropped it even though main.py uses it.
__all__ = [
    "xml_to_db",
    "json_to_db",
]
Loading
Loading